OmniSciDB  cde582ebc3
RuntimeFunctions.cpp
1 /*
2  * Copyright 2022 HEAVY.AI, Inc.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #ifdef __CUDACC__
18 #error This code is not intended to be compiled with a CUDA C++ compiler
19 #endif // __CUDACC__
20 
21 #include "RuntimeFunctions.h"
22 #include "../Shared/funcannotations.h"
23 #include "BufferCompaction.h"
24 #include "HyperLogLogRank.h"
25 #include "MurmurHash.h"
26 #include "Shared/quantile.h"
27 #include "TypePunning.h"
28 #include "Utils/SegmentTreeUtils.h"
29 
30 #include <algorithm>
31 #include <atomic>
32 #include <chrono>
33 #include <cmath>
34 #include <cstdio>
35 #include <cstring>
36 #include <thread>
37 #include <tuple>
38 
39 // decoder implementations
40 
41 #include "DecodersImpl.h"
42 
43 // arithmetic operator implementations
44 
45 #define DEF_ARITH_NULLABLE(type, null_type, opname, opsym) \
46  extern "C" RUNTIME_EXPORT ALWAYS_INLINE type opname##_##type##_nullable( \
47  const type lhs, const type rhs, const null_type null_val) { \
48  if (lhs != null_val && rhs != null_val) { \
49  return lhs opsym rhs; \
50  } \
51  return null_val; \
52  }
53 
54 #define DEF_ARITH_NULLABLE_LHS(type, null_type, opname, opsym) \
55  extern "C" RUNTIME_EXPORT ALWAYS_INLINE type opname##_##type##_nullable_lhs( \
56  const type lhs, const type rhs, const null_type null_val) { \
57  if (lhs != null_val) { \
58  return lhs opsym rhs; \
59  } \
60  return null_val; \
61  }
62 
63 #define DEF_ARITH_NULLABLE_RHS(type, null_type, opname, opsym) \
64  extern "C" RUNTIME_EXPORT ALWAYS_INLINE type opname##_##type##_nullable_rhs( \
65  const type lhs, const type rhs, const null_type null_val) { \
66  if (rhs != null_val) { \
67  return lhs opsym rhs; \
68  } \
69  return null_val; \
70  }
71 
72 #define DEF_CMP_NULLABLE(type, null_type, opname, opsym) \
73  extern "C" RUNTIME_EXPORT ALWAYS_INLINE int8_t opname##_##type##_nullable( \
74  const type lhs, \
75  const type rhs, \
76  const null_type null_val, \
77  const int8_t null_bool_val) { \
78  if (lhs != null_val && rhs != null_val) { \
79  return lhs opsym rhs; \
80  } \
81  return null_bool_val; \
82  }
83 
84 #define DEF_CMP_NULLABLE_LHS(type, null_type, opname, opsym) \
85  extern "C" RUNTIME_EXPORT ALWAYS_INLINE int8_t opname##_##type##_nullable_lhs( \
86  const type lhs, \
87  const type rhs, \
88  const null_type null_val, \
89  const int8_t null_bool_val) { \
90  if (lhs != null_val) { \
91  return lhs opsym rhs; \
92  } \
93  return null_bool_val; \
94  }
95 
96 #define DEF_CMP_NULLABLE_RHS(type, null_type, opname, opsym) \
97  extern "C" RUNTIME_EXPORT ALWAYS_INLINE int8_t opname##_##type##_nullable_rhs( \
98  const type lhs, \
99  const type rhs, \
100  const null_type null_val, \
101  const int8_t null_bool_val) { \
102  if (rhs != null_val) { \
103  return lhs opsym rhs; \
104  } \
105  return null_bool_val; \
106  }
107 
108 #define DEF_SAFE_DIV_NULLABLE(type, null_type, opname) \
109  extern "C" RUNTIME_EXPORT ALWAYS_INLINE type safe_div_##type( \
110  const type lhs, const type rhs, const null_type null_val) { \
111  if (lhs != null_val && rhs != null_val && rhs != 0) { \
112  return lhs / rhs; \
113  } \
114  return null_val; \
115  }
116 
117 #define DEF_BINARY_NULLABLE_ALL_OPS(type, null_type) \
118  DEF_ARITH_NULLABLE(type, null_type, add, +) \
119  DEF_ARITH_NULLABLE(type, null_type, sub, -) \
120  DEF_ARITH_NULLABLE(type, null_type, mul, *) \
121  DEF_ARITH_NULLABLE(type, null_type, div, /) \
122  DEF_SAFE_DIV_NULLABLE(type, null_type, safe_div) \
123  DEF_ARITH_NULLABLE_LHS(type, null_type, add, +) \
124  DEF_ARITH_NULLABLE_LHS(type, null_type, sub, -) \
125  DEF_ARITH_NULLABLE_LHS(type, null_type, mul, *) \
126  DEF_ARITH_NULLABLE_LHS(type, null_type, div, /) \
127  DEF_ARITH_NULLABLE_RHS(type, null_type, add, +) \
128  DEF_ARITH_NULLABLE_RHS(type, null_type, sub, -) \
129  DEF_ARITH_NULLABLE_RHS(type, null_type, mul, *) \
130  DEF_ARITH_NULLABLE_RHS(type, null_type, div, /) \
131  DEF_CMP_NULLABLE(type, null_type, eq, ==) \
132  DEF_CMP_NULLABLE(type, null_type, ne, !=) \
133  DEF_CMP_NULLABLE(type, null_type, lt, <) \
134  DEF_CMP_NULLABLE(type, null_type, gt, >) \
135  DEF_CMP_NULLABLE(type, null_type, le, <=) \
136  DEF_CMP_NULLABLE(type, null_type, ge, >=) \
137  DEF_CMP_NULLABLE_LHS(type, null_type, eq, ==) \
138  DEF_CMP_NULLABLE_LHS(type, null_type, ne, !=) \
139  DEF_CMP_NULLABLE_LHS(type, null_type, lt, <) \
140  DEF_CMP_NULLABLE_LHS(type, null_type, gt, >) \
141  DEF_CMP_NULLABLE_LHS(type, null_type, le, <=) \
142  DEF_CMP_NULLABLE_LHS(type, null_type, ge, >=) \
143  DEF_CMP_NULLABLE_RHS(type, null_type, eq, ==) \
144  DEF_CMP_NULLABLE_RHS(type, null_type, ne, !=) \
145  DEF_CMP_NULLABLE_RHS(type, null_type, lt, <) \
146  DEF_CMP_NULLABLE_RHS(type, null_type, gt, >) \
147  DEF_CMP_NULLABLE_RHS(type, null_type, le, <=) \
148  DEF_CMP_NULLABLE_RHS(type, null_type, ge, >=)
149 
150 DEF_BINARY_NULLABLE_ALL_OPS(int8_t, int64_t)
151 DEF_BINARY_NULLABLE_ALL_OPS(int16_t, int64_t)
152 DEF_BINARY_NULLABLE_ALL_OPS(int32_t, int64_t)
153 DEF_BINARY_NULLABLE_ALL_OPS(int64_t, int64_t)
154 DEF_BINARY_NULLABLE_ALL_OPS(float, float)
155 DEF_BINARY_NULLABLE_ALL_OPS(double, double)
156 DEF_ARITH_NULLABLE(int8_t, int64_t, mod, %)
157 DEF_ARITH_NULLABLE(int16_t, int64_t, mod, %)
158 DEF_ARITH_NULLABLE(int32_t, int64_t, mod, %)
159 DEF_ARITH_NULLABLE(int64_t, int64_t, mod, %)
160 DEF_ARITH_NULLABLE_LHS(int8_t, int64_t, mod, %)
161 DEF_ARITH_NULLABLE_LHS(int16_t, int64_t, mod, %)
162 DEF_ARITH_NULLABLE_LHS(int32_t, int64_t, mod, %)
163 DEF_ARITH_NULLABLE_LHS(int64_t, int64_t, mod, %)
164 DEF_ARITH_NULLABLE_RHS(int8_t, int64_t, mod, %)
165 DEF_ARITH_NULLABLE_RHS(int16_t, int64_t, mod, %)
166 DEF_ARITH_NULLABLE_RHS(int32_t, int64_t, mod, %)
167 DEF_ARITH_NULLABLE_RHS(int64_t, int64_t, mod, %)
168 
169 #undef DEF_BINARY_NULLABLE_ALL_OPS
170 #undef DEF_SAFE_DIV_NULLABLE
171 #undef DEF_CMP_NULLABLE_RHS
172 #undef DEF_CMP_NULLABLE_LHS
173 #undef DEF_CMP_NULLABLE
174 #undef DEF_ARITH_NULLABLE_RHS
175 #undef DEF_ARITH_NULLABLE_LHS
176 #undef DEF_ARITH_NULLABLE
177 
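// Usage sketch for the operators generated above (illustrative; the sentinel
// value below is assumed, not defined in this file). For example,
// DEF_ARITH_NULLABLE(int32_t, int64_t, add, +) expands to a function
// equivalent to:
//
//   extern "C" int32_t add_int32_t_nullable(const int32_t lhs,
//                                           const int32_t rhs,
//                                           const int64_t null_val) {
//     if (lhs != null_val && rhs != null_val) {
//       return lhs + rhs;
//     }
//     return null_val;
//   }
//
// With a NULL sentinel of -2147483648, add_int32_t_nullable(3, 4, -2147483648)
// returns 7, while add_int32_t_nullable(3, -2147483648, -2147483648) returns
// the sentinel again, i.e. SQL NULL propagates through the arithmetic.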
178 extern "C" RUNTIME_EXPORT ALWAYS_INLINE int64_t
179 scale_decimal_up(const int64_t operand,
180  const uint64_t scale,
181  const int64_t operand_null_val,
182  const int64_t result_null_val) {
183  return operand != operand_null_val ? operand * scale : result_null_val;
184 }
185 
186 extern "C" RUNTIME_EXPORT ALWAYS_INLINE int64_t
187 scale_decimal_down_nullable(const int64_t operand,
188  const int64_t scale,
189  const int64_t null_val) {
190  // rounded scale down of a decimal
191  if (operand == null_val) {
192  return null_val;
193  }
194 
195  int64_t tmp = scale >> 1;
196  tmp = operand >= 0 ? operand + tmp : operand - tmp;
197  return tmp / scale;
198 }
199 
200 extern "C" RUNTIME_EXPORT ALWAYS_INLINE int64_t
201 scale_decimal_down_not_nullable(const int64_t operand,
202  const int64_t scale,
203  const int64_t null_val) {
204  int64_t tmp = scale >> 1;
205  tmp = operand >= 0 ? operand + tmp : operand - tmp;
206  return tmp / scale;
207 }
208 
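// Worked example for the decimal rescaling helpers above (values are
// illustrative; NULL_VAL stands for whatever sentinel the caller passes in).
// With scale = 100, i.e. two fractional decimal digits:
//   scale_decimal_up(5, 100, NULL_VAL, NULL_VAL)        ==  500   // 5 -> 5.00
//   scale_decimal_down_nullable(12345, 100, NULL_VAL)   ==  123   // 123.45 -> 123
//   scale_decimal_down_nullable(-12355, 100, NULL_VAL)  == -124   // -123.55 -> -124
// The "scale >> 1" term adds (or subtracts) half of the scale before the
// integer division, turning plain truncation into round-half-away-from-zero.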
209 // Return floor(dividend / divisor).
210 // Assumes 0 < divisor.
211 extern "C" RUNTIME_EXPORT ALWAYS_INLINE int64_t floor_div_lhs(const int64_t dividend,
212  const int64_t divisor) {
213  return (dividend < 0 ? dividend - (divisor - 1) : dividend) / divisor;
214 }
215 
216 // Return floor(dividend / divisor) or NULL if dividend IS NULL.
217 // Assumes 0 < divisor.
218 extern "C" RUNTIME_EXPORT ALWAYS_INLINE int64_t
219 floor_div_nullable_lhs(const int64_t dividend,
220  const int64_t divisor,
221  const int64_t null_val) {
222  return dividend == null_val ? null_val : floor_div_lhs(dividend, divisor);
223 }
224 
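// Worked example: C++ integer division truncates toward zero, so -7 / 2 == -3,
// whereas floor(-7 / 2.0) == -4. floor_div_lhs compensates by first shifting a
// negative dividend down by (divisor - 1):
//   floor_div_lhs(7, 2)  == 3
//   floor_div_lhs(-7, 2) == (-7 - 1) / 2 == -4
// floor_div_nullable_lhs only adds the NULL check on the dividend.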
225 #define DEF_UMINUS_NULLABLE(type, null_type) \
226  extern "C" RUNTIME_EXPORT ALWAYS_INLINE type uminus_##type##_nullable( \
227  const type operand, const null_type null_val) { \
228  return operand == null_val ? null_val : -operand; \
229  }
230 
231 DEF_UMINUS_NULLABLE(int8_t, int8_t)
232 DEF_UMINUS_NULLABLE(int16_t, int16_t)
233 DEF_UMINUS_NULLABLE(int32_t, int32_t)
234 DEF_UMINUS_NULLABLE(int64_t, int64_t)
235 DEF_UMINUS_NULLABLE(float, float)
236 DEF_UMINUS_NULLABLE(double, double)
237 
238 #undef DEF_UMINUS_NULLABLE
239 
240 #define DEF_CAST_NULLABLE(from_type, to_type) \
241  extern "C" RUNTIME_EXPORT ALWAYS_INLINE to_type \
242  cast_##from_type##_to_##to_type##_nullable(const from_type operand, \
243  const from_type from_null_val, \
244  const to_type to_null_val) { \
245  return operand == from_null_val ? to_null_val : operand; \
246  }
247 
248 #define DEF_CAST_SCALED_NULLABLE(from_type, to_type) \
249  extern "C" RUNTIME_EXPORT ALWAYS_INLINE to_type \
250  cast_##from_type##_to_##to_type##_scaled_nullable(const from_type operand, \
251  const from_type from_null_val, \
252  const to_type to_null_val, \
253  const to_type multiplier) { \
254  return operand == from_null_val ? to_null_val : multiplier * operand; \
255  }
256 
257 #define DEF_CAST_NULLABLE_BIDIR(type1, type2) \
258  DEF_CAST_NULLABLE(type1, type2) \
259  DEF_CAST_NULLABLE(type2, type1)
260 
261 #define DEF_ROUND_NULLABLE(from_type, to_type) \
262  extern "C" RUNTIME_EXPORT ALWAYS_INLINE to_type \
263  cast_##from_type##_to_##to_type##_nullable(const from_type operand, \
264  const from_type from_null_val, \
265  const to_type to_null_val) { \
266  return operand == from_null_val \
267  ? to_null_val \
268  : static_cast<to_type>(operand + (operand < from_type(0) \
269  ? from_type(-0.5) \
270  : from_type(0.5))); \
271  }
272 
273 DEF_CAST_NULLABLE_BIDIR(int8_t, int16_t)
274 DEF_CAST_NULLABLE_BIDIR(int8_t, int32_t)
275 DEF_CAST_NULLABLE_BIDIR(int8_t, int64_t)
276 DEF_CAST_NULLABLE_BIDIR(int16_t, int32_t)
277 DEF_CAST_NULLABLE_BIDIR(int16_t, int64_t)
278 DEF_CAST_NULLABLE_BIDIR(int32_t, int64_t)
279 DEF_CAST_NULLABLE_BIDIR(float, double)
280 
281 DEF_CAST_NULLABLE(int8_t, float)
282 DEF_CAST_NULLABLE(int16_t, float)
283 DEF_CAST_NULLABLE(int32_t, float)
284 DEF_CAST_NULLABLE(int64_t, float)
285 DEF_CAST_NULLABLE(int8_t, double)
286 DEF_CAST_NULLABLE(int16_t, double)
287 DEF_CAST_NULLABLE(int32_t, double)
288 DEF_CAST_NULLABLE(int64_t, double)
289 
290 DEF_ROUND_NULLABLE(float, int8_t)
291 DEF_ROUND_NULLABLE(float, int16_t)
292 DEF_ROUND_NULLABLE(float, int32_t)
293 DEF_ROUND_NULLABLE(float, int64_t)
294 DEF_ROUND_NULLABLE(double, int8_t)
295 DEF_ROUND_NULLABLE(double, int16_t)
296 DEF_ROUND_NULLABLE(double, int32_t)
297 DEF_ROUND_NULLABLE(double, int64_t)
298 
299 DEF_CAST_NULLABLE(uint8_t, int32_t)
300 DEF_CAST_NULLABLE(uint16_t, int32_t)
301 DEF_CAST_SCALED_NULLABLE(int64_t, float)
302 DEF_CAST_SCALED_NULLABLE(int64_t, double)
303 
304 #undef DEF_ROUND_NULLABLE
305 #undef DEF_CAST_NULLABLE_BIDIR
306 #undef DEF_CAST_SCALED_NULLABLE
307 #undef DEF_CAST_NULLABLE
308 
309 extern "C" RUNTIME_EXPORT ALWAYS_INLINE int8_t logical_not(const int8_t operand,
310  const int8_t null_val) {
311  return operand == null_val ? operand : (operand ? 0 : 1);
312 }
313 
314 extern "C" RUNTIME_EXPORT ALWAYS_INLINE int8_t logical_and(const int8_t lhs,
315  const int8_t rhs,
316  const int8_t null_val) {
317  if (lhs == null_val) {
318  return rhs == 0 ? rhs : null_val;
319  }
320  if (rhs == null_val) {
321  return lhs == 0 ? lhs : null_val;
322  }
323  return (lhs && rhs) ? 1 : 0;
324 }
325 
326 extern "C" RUNTIME_EXPORT ALWAYS_INLINE int8_t logical_or(const int8_t lhs,
327  const int8_t rhs,
328  const int8_t null_val) {
329  if (lhs == null_val) {
330  return rhs == 0 ? null_val : rhs;
331  }
332  if (rhs == null_val) {
333  return lhs == 0 ? null_val : lhs;
334  }
335  return (lhs || rhs) ? 1 : 0;
336 }
337 
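// The three functions above implement SQL three-valued logic over int8_t with
// an explicit NULL sentinel. With an assumed sentinel of -128 for illustration:
//   logical_and(1, -128, -128) == -128   // TRUE  AND NULL -> NULL
//   logical_and(0, -128, -128) ==    0   // FALSE AND NULL -> FALSE
//   logical_or(1, -128, -128)  ==    1   // TRUE  OR  NULL -> TRUE
//   logical_or(0, -128, -128)  == -128   // FALSE OR  NULL -> NULL
//   logical_not(-128, -128)    == -128   // NOT NULL       -> NULL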
338 // aggregator implementations
339 
340 extern "C" RUNTIME_EXPORT ALWAYS_INLINE uint64_t agg_count(uint64_t* agg, const int64_t) {
341  return (*agg)++;
342 }
343 
344 extern "C" RUNTIME_EXPORT ALWAYS_INLINE void
345 agg_count_distinct_bitmap(int64_t* agg, const int64_t val, const int64_t min_val) {
346  const uint64_t bitmap_idx = val - min_val;
347  reinterpret_cast<int8_t*>(*agg)[bitmap_idx >> 3] |= (1 << (bitmap_idx & 7));
348 }
349 
350 #ifdef _MSC_VER
351 #define GPU_RT_STUB NEVER_INLINE
352 #else
353 #define GPU_RT_STUB NEVER_INLINE __attribute__((optnone))
354 #endif
355 
356 extern "C" GPU_RT_STUB void agg_count_distinct_bitmap_gpu(int64_t*,
357  const int64_t,
358  const int64_t,
359  const int64_t,
360  const int64_t,
361  const uint64_t,
362  const uint64_t) {}
363 
364 extern "C" RUNTIME_EXPORT NEVER_INLINE void
365 agg_approximate_count_distinct(int64_t* agg, const int64_t key, const uint32_t b) {
366  const uint64_t hash = MurmurHash64A(&key, sizeof(key), 0);
367  const uint32_t index = hash >> (64 - b);
368  const uint8_t rank = get_rank(hash << b, 64 - b);
369  uint8_t* M = reinterpret_cast<uint8_t*>(*agg);
370  M[index] = std::max(M[index], rank);
371 }
372 
373 extern "C" GPU_RT_STUB void agg_approximate_count_distinct_gpu(int64_t*,
374  const int64_t,
375  const uint32_t,
376  const int64_t,
377  const int64_t) {}
378 
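// Sketch of the HyperLogLog update performed by agg_approximate_count_distinct
// (numbers are illustrative): with b = 11 the sketch holds 2^11 = 2048 one-byte
// registers M. For every key, the 64-bit Murmur hash is split as
//   index = hash >> (64 - 11);          // top 11 bits select a register
//   rank  = get_rank(hash << 11, 53);   // rank of the remaining 53 bits
// and M[index] keeps the largest rank observed. The cardinality estimate is
// computed later from the register array (see HyperLogLogRank.h).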
379 extern "C" RUNTIME_EXPORT ALWAYS_INLINE int8_t bit_is_set(const int64_t bitset,
380  const int64_t val,
381  const int64_t min_val,
382  const int64_t max_val,
383  const int64_t null_val,
384  const int8_t null_bool_val) {
385  if (val == null_val) {
386  return null_bool_val;
387  }
388  if (val < min_val || val > max_val) {
389  return 0;
390  }
391  if (!bitset) {
392  return 0;
393  }
394  const uint64_t bitmap_idx = val - min_val;
395  return (reinterpret_cast<const int8_t*>(bitset))[bitmap_idx >> 3] &
396  (1 << (bitmap_idx & 7))
397  ? 1
398  : 0;
399 }
400 
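// Bitmap addressing shared by agg_count_distinct_bitmap and bit_is_set: a value
// v occupies bit (v - min_val), i.e. byte (v - min_val) >> 3, bit (v - min_val) & 7.
// For example, with min_val = 100 the value 113 maps to
//   bitmap_idx = 113 - 100 = 13;   byte 13 >> 3 = 1;   mask 1 << (13 & 7) = 0x20
// so COUNT(DISTINCT ...) sets that bit and bit_is_set() tests it.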
401 #define DEF_COMPUTE_LOWER_BOUND(value_type) \
402  extern "C" RUNTIME_EXPORT ALWAYS_INLINE int64_t compute_##value_type##_lower_bound( \
403  const int64_t entry_cnt, \
404  const value_type target_value, \
405  const value_type* col_buf) { \
406  int64_t l = 0; \
407  int64_t h = entry_cnt - 1; \
408  while (l < h) { \
409  int64_t mid = l + (h - l) / 2; \
410  if (target_value < col_buf[mid]) { \
411  h = mid; \
412  } else { \
413  l = mid + 1; \
414  } \
415  } \
416  return l; \
417  }
418 DEF_COMPUTE_LOWER_BOUND(int8_t)
419 DEF_COMPUTE_LOWER_BOUND(int16_t)
420 DEF_COMPUTE_LOWER_BOUND(int32_t)
421 DEF_COMPUTE_LOWER_BOUND(int64_t)
422 DEF_COMPUTE_LOWER_BOUND(float)
423 DEF_COMPUTE_LOWER_BOUND(double)
424 #undef DEF_COMPUTE_LOWER_BOUND
425 
426 #define DEF_COMPUTE_UPPER_BOUND(value_type) \
427  extern "C" RUNTIME_EXPORT ALWAYS_INLINE int64_t compute_##value_type##_upper_bound( \
428  const int64_t entry_cnt, \
429  const value_type target_value, \
430  const value_type* col_buf) { \
431  int64_t l = 0; \
432  int64_t h = entry_cnt - 1; \
433  while (l < h) { \
434  int64_t mid = l + (h - l) / 2; \
435  if (target_value >= col_buf[mid]) { \
436  h = mid; \
437  } else { \
438  l = mid + 1; \
439  } \
440  } \
441  return l; \
442  }
443 DEF_COMPUTE_UPPER_BOUND(int8_t)
444 DEF_COMPUTE_UPPER_BOUND(int16_t)
445 DEF_COMPUTE_UPPER_BOUND(int32_t)
446 DEF_COMPUTE_UPPER_BOUND(int64_t)
447 DEF_COMPUTE_UPPER_BOUND(float)
448 DEF_COMPUTE_UPPER_BOUND(double)
449 #undef DEF_COMPUTE_UPPER_BOUND
450 
451 extern "C" RUNTIME_EXPORT ALWAYS_INLINE int64_t
452 get_valid_buf_start_pos(const int64_t null_start_pos, const int64_t null_end_pos) {
453  return null_start_pos == 0 ? null_end_pos + 1 : 0;
454 }
455 
456 extern "C" RUNTIME_EXPORT ALWAYS_INLINE int64_t
457 get_valid_buf_end_pos(const int64_t num_elems,
458  const int64_t null_start_pos,
459  const int64_t null_end_pos) {
460  return null_end_pos == num_elems ? null_start_pos : num_elems;
461 }
462 
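// The two helpers above rely on the ordered window partition keeping its NULL
// keys in one contiguous run, either at the very front or at the very back:
//   - if the run starts at position 0, the first valid (non-NULL) position is
//     null_end_pos + 1;
//   - if the run ends at num_elems, the valid range stops at null_start_pos;
//   - otherwise the whole range [0, num_elems) is searched.
// The ordered-index binary searches below only probe this valid sub-range.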
463 #define DEF_COMPUTE_LOWER_BOUND_FROM_ORDERED_INDEX(value_type) \
464  extern "C" RUNTIME_EXPORT ALWAYS_INLINE int64_t \
465  compute_##value_type##_lower_bound_from_ordered_index( \
466  const int64_t num_elems, \
467  const value_type target_value, \
468  const value_type* col_buf, \
469  const int32_t* partition_rowid_buf, \
470  const int64_t* ordered_index_buf, \
471  const value_type null_val, \
472  const int64_t null_start_pos, \
473  const int64_t null_end_pos) { \
474  if (target_value == null_val) { \
475  return null_start_pos; \
476  } \
477  int64_t l = get_valid_buf_start_pos(null_start_pos, null_end_pos); \
478  int64_t h = get_valid_buf_end_pos(num_elems, null_start_pos, null_end_pos); \
479  while (l < h) { \
480  int64_t mid = l + (h - l) / 2; \
481  if (target_value <= col_buf[partition_rowid_buf[ordered_index_buf[mid]]]) { \
482  h = mid; \
483  } else { \
484  l = mid + 1; \
485  } \
486  } \
487  return h; \
488  }
489 DEF_COMPUTE_LOWER_BOUND_FROM_ORDERED_INDEX(int8_t)
490 DEF_COMPUTE_LOWER_BOUND_FROM_ORDERED_INDEX(int16_t)
491 DEF_COMPUTE_LOWER_BOUND_FROM_ORDERED_INDEX(int32_t)
492 DEF_COMPUTE_LOWER_BOUND_FROM_ORDERED_INDEX(int64_t)
493 DEF_COMPUTE_LOWER_BOUND_FROM_ORDERED_INDEX(float)
494 DEF_COMPUTE_LOWER_BOUND_FROM_ORDERED_INDEX(double)
495 #undef DEF_COMPUTE_LOWER_BOUND_FROM_ORDERED_INDEX
496 
497 #define DEF_COMPUTE_UPPER_BOUND_FROM_ORDERED_INDEX(value_type) \
498  extern "C" RUNTIME_EXPORT ALWAYS_INLINE int64_t \
499  compute_##value_type##_upper_bound_from_ordered_index( \
500  const int64_t num_elems, \
501  const value_type target_value, \
502  const value_type* col_buf, \
503  const int32_t* partition_rowid_buf, \
504  const int64_t* ordered_index_buf, \
505  const value_type null_val, \
506  const int64_t null_start_pos, \
507  const int64_t null_end_pos) { \
508  if (target_value == null_val) { \
509  return null_end_pos; \
510  } \
511  int64_t l = get_valid_buf_start_pos(null_start_pos, null_end_pos); \
512  int64_t h = get_valid_buf_end_pos(num_elems, null_start_pos, null_end_pos); \
513  while (l < h) { \
514  int64_t mid = l + (h - l) / 2; \
515  if (target_value >= col_buf[partition_rowid_buf[ordered_index_buf[mid]]]) { \
516  l = mid + 1; \
517  } else { \
518  h = mid; \
519  } \
520  } \
521  return l; \
522  }
523 DEF_COMPUTE_UPPER_BOUND_FROM_ORDERED_INDEX(int8_t)
524 DEF_COMPUTE_UPPER_BOUND_FROM_ORDERED_INDEX(int16_t)
525 DEF_COMPUTE_UPPER_BOUND_FROM_ORDERED_INDEX(int32_t)
526 DEF_COMPUTE_UPPER_BOUND_FROM_ORDERED_INDEX(int64_t)
527 DEF_COMPUTE_UPPER_BOUND_FROM_ORDERED_INDEX(float)
528 DEF_COMPUTE_UPPER_BOUND_FROM_ORDERED_INDEX(double)
529 #undef DEF_COMPUTE_UPPER_BOUND_FROM_ORDERED_INDEX
530 
531 extern "C" RUNTIME_EXPORT ALWAYS_INLINE int64_t
532 compute_row_mode_start_index_sub(int64_t candidate_index,
533  int64_t current_partition_start_offset,
534  int64_t frame_bound) {
535  int64_t index = candidate_index - current_partition_start_offset - frame_bound;
536  return index < 0 ? 0 : index;
537 }
538 
539 extern "C" RUNTIME_EXPORT ALWAYS_INLINE int64_t
540 compute_row_mode_start_index_add(int64_t candidate_index,
541  int64_t current_partition_start_offset,
542  int64_t frame_bound,
543  int64_t num_current_partition_elem) {
544  int64_t index = candidate_index - current_partition_start_offset + frame_bound;
545  return index >= num_current_partition_elem ? num_current_partition_elem : index;
546 }
547 
548 extern "C" RUNTIME_EXPORT ALWAYS_INLINE int64_t
549 compute_row_mode_end_index_sub(int64_t candidate_index,
550  int64_t current_partition_start_offset,
551  int64_t frame_bound) {
552  int64_t index = candidate_index - current_partition_start_offset - frame_bound;
553  return index < 0 ? 0 : index + 1;
554 }
555 
556 extern "C" RUNTIME_EXPORT ALWAYS_INLINE int64_t
557 compute_row_mode_end_index_add(int64_t candidate_index,
558  int64_t current_partition_start_offset,
559  int64_t frame_bound,
560  int64_t num_current_partition_elem) {
561  int64_t index = candidate_index - current_partition_start_offset + frame_bound;
562  return index >= num_current_partition_elem ? num_current_partition_elem : index + 1;
563 }
564 
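// Worked example for the four ROWS-mode helpers above. For the frame clause
// "ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING", a partition of 8 rows starting at
// global offset 100, and the current row at global position 103 (local
// position 3):
//   compute_row_mode_start_index_sub(103, 100, 2)  == 1
//   compute_row_mode_end_index_add(103, 100, 1, 8) == 5
// giving the half-open local frame [1, 5), i.e. rows 1 through 4. Both helpers
// clamp at the partition edges instead of running past them.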
565 #define DEF_RANGE_MODE_FRAME_LOWER_BOUND(value_type, opname, opsym) \
566  extern "C" RUNTIME_EXPORT ALWAYS_INLINE int64_t \
567  range_mode_##value_type##_##opname##_frame_lower_bound( \
568  const int64_t num_elems, \
569  const value_type target_value, \
570  const value_type* col_buf, \
571  const int32_t* partition_rowid_buf, \
572  const int64_t* ordered_index_buf, \
573  const int64_t frame_bound_val, \
574  const value_type null_val, \
575  const int64_t null_start_pos, \
576  const int64_t null_end_pos) { \
577  if (target_value == null_val) { \
578  return null_start_pos; \
579  } else { \
580  value_type new_val = target_value opsym frame_bound_val; \
581  return compute_##value_type##_lower_bound_from_ordered_index(num_elems, \
582  new_val, \
583  col_buf, \
584  partition_rowid_buf, \
585  ordered_index_buf, \
586  null_val, \
587  null_start_pos, \
588  null_end_pos); \
589  } \
590  }
591 DEF_RANGE_MODE_FRAME_LOWER_BOUND(int8_t, add, +)
592 DEF_RANGE_MODE_FRAME_LOWER_BOUND(int8_t, sub, -)
593 DEF_RANGE_MODE_FRAME_LOWER_BOUND(int16_t, add, +)
594 DEF_RANGE_MODE_FRAME_LOWER_BOUND(int16_t, sub, -)
595 DEF_RANGE_MODE_FRAME_LOWER_BOUND(int32_t, add, +)
596 DEF_RANGE_MODE_FRAME_LOWER_BOUND(int32_t, sub, -)
597 DEF_RANGE_MODE_FRAME_LOWER_BOUND(int64_t, add, +)
598 DEF_RANGE_MODE_FRAME_LOWER_BOUND(int64_t, sub, -)
599 DEF_RANGE_MODE_FRAME_LOWER_BOUND(float, add, +)
600 DEF_RANGE_MODE_FRAME_LOWER_BOUND(float, sub, -)
601 DEF_RANGE_MODE_FRAME_LOWER_BOUND(double, add, +)
602 DEF_RANGE_MODE_FRAME_LOWER_BOUND(double, sub, -)
603 #undef DEF_RANGE_MODE_FRAME_LOWER_BOUND
604 
605 #define DEF_RANGE_MODE_FRAME_UPPER_BOUND(value_type, opname, opsym) \
606  extern "C" RUNTIME_EXPORT ALWAYS_INLINE int64_t \
607  range_mode_##value_type##_##opname##_frame_upper_bound( \
608  const int64_t num_elems, \
609  const value_type target_value, \
610  const value_type* col_buf, \
611  const int32_t* partition_rowid_buf, \
612  const int64_t* ordered_index_buf, \
613  const int64_t frame_bound_val, \
614  const value_type null_val, \
615  const int64_t null_start_pos, \
616  const int64_t null_end_pos) { \
617  if (target_value == null_val) { \
618  return null_end_pos; \
619  } else { \
620  value_type new_val = target_value opsym frame_bound_val; \
621  return compute_##value_type##_upper_bound_from_ordered_index(num_elems, \
622  new_val, \
623  col_buf, \
624  partition_rowid_buf, \
625  ordered_index_buf, \
626  null_val, \
627  null_start_pos, \
628  null_end_pos); \
629  } \
630  }
631 DEF_RANGE_MODE_FRAME_UPPER_BOUND(int8_t, add, +)
632 DEF_RANGE_MODE_FRAME_UPPER_BOUND(int8_t, sub, -)
633 DEF_RANGE_MODE_FRAME_UPPER_BOUND(int16_t, add, +)
634 DEF_RANGE_MODE_FRAME_UPPER_BOUND(int16_t, sub, -)
635 DEF_RANGE_MODE_FRAME_UPPER_BOUND(int32_t, add, +)
636 DEF_RANGE_MODE_FRAME_UPPER_BOUND(int32_t, sub, -)
637 DEF_RANGE_MODE_FRAME_UPPER_BOUND(int64_t, add, +)
638 DEF_RANGE_MODE_FRAME_UPPER_BOUND(int64_t, sub, -)
639 DEF_RANGE_MODE_FRAME_UPPER_BOUND(float, add, +)
640 DEF_RANGE_MODE_FRAME_UPPER_BOUND(float, sub, -)
641 DEF_RANGE_MODE_FRAME_UPPER_BOUND(double, add, +)
642 DEF_RANGE_MODE_FRAME_UPPER_BOUND(double, sub, -)
643 #undef DEF_RANGE_MODE_FRAME_UPPER_BOUND
644 
645 extern "C" RUNTIME_EXPORT ALWAYS_INLINE int64_t* get_integer_aggregation_tree(
646  int64_t** aggregation_trees,
647  size_t partition_idx) {
648  return aggregation_trees[partition_idx];
649 }
650 
651 extern "C" RUNTIME_EXPORT ALWAYS_INLINE double* get_double_aggregation_tree(
652  int64_t** aggregation_trees,
653  size_t partition_idx) {
654  double** casted_aggregation_trees = reinterpret_cast<double**>(aggregation_trees);
655  return casted_aggregation_trees[partition_idx];
656 }
657 
658 extern "C" RUNTIME_EXPORT ALWAYS_INLINE SumAndCountPair<int64_t>*
659 get_integer_derived_aggregation_tree(int64_t** aggregation_trees, size_t partition_idx) {
660  SumAndCountPair<int64_t>** casted_aggregation_trees =
661  reinterpret_cast<SumAndCountPair<int64_t>**>(aggregation_trees);
662  return casted_aggregation_trees[partition_idx];
663 }
664 
665 extern "C" RUNTIME_EXPORT ALWAYS_INLINE SumAndCountPair<double>*
666 get_double_derived_aggregation_tree(int64_t** aggregation_trees, size_t partition_idx) {
667  SumAndCountPair<double>** casted_aggregation_trees =
668  reinterpret_cast<SumAndCountPair<double>**>(aggregation_trees);
669  return casted_aggregation_trees[partition_idx];
670 }
671 
672 extern "C" RUNTIME_EXPORT ALWAYS_INLINE size_t
673 getStartOffsetForSegmentTreeTraversal(size_t level, size_t tree_fanout) {
674  size_t offset = 0;
675  for (size_t i = 0; i < level; i++) {
676  offset += pow(tree_fanout, i);
677  }
678  return offset;
679 }
680 
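// The aggregation tree is stored level by level, so the leaves of a tree with
// fanout f and leaf level L start right after all internal nodes, at offset
// 1 + f + f^2 + ... + f^(L-1). For example, with tree_fanout = 2 and
// leaf_level = 3 the loop above returns 1 + 2 + 4 = 7, so leaf i lives at
// aggregation_tree[7 + i]. The frame min/max/sum searches below use this offset
// to translate query ranges into leaf positions.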
681 #define DEF_COMPUTE_FRAME_MIN(agg_value_type) \
682  extern "C" RUNTIME_EXPORT ALWAYS_INLINE agg_value_type \
683  compute_##agg_value_type##_frame_min( \
684  agg_value_type* aggregation_tree_for_partition, \
685  size_t query_range_start_idx, \
686  size_t query_range_end_idx, \
687  size_t leaf_level, \
688  size_t tree_fanout, \
689  agg_value_type invalid_val, \
690  agg_value_type null_val) { \
691  size_t leaf_start_idx = \
692  getStartOffsetForSegmentTreeTraversal(leaf_level, tree_fanout); \
693  size_t begin = leaf_start_idx + query_range_start_idx; \
694  size_t end = leaf_start_idx + query_range_end_idx; \
695  agg_value_type res = std::numeric_limits<agg_value_type>::max(); \
696  bool all_nulls = true; \
697  for (size_t level = leaf_level; level >= 0; level--) { \
698  size_t parentBegin = begin / tree_fanout; \
699  size_t parentEnd = (end - 1) / tree_fanout; \
700  if (parentBegin == parentEnd) { \
701  for (size_t pos = begin; pos < end; pos++) { \
702  if (aggregation_tree_for_partition[pos] != null_val) { \
703  all_nulls = false; \
704  res = std::min(res, aggregation_tree_for_partition[pos]); \
705  } \
706  } \
707  return all_nulls ? null_val : res; \
708  } \
709  size_t group_begin = (parentBegin * tree_fanout) + 1; \
710  if (begin != group_begin) { \
711  size_t limit = (parentBegin * tree_fanout) + tree_fanout + 1; \
712  for (size_t pos = begin; pos < limit; pos++) { \
713  if (aggregation_tree_for_partition[pos] != null_val) { \
714  all_nulls = false; \
715  res = std::min(res, aggregation_tree_for_partition[pos]); \
716  } \
717  } \
718  parentBegin++; \
719  } \
720  size_t group_end = (parentEnd * tree_fanout) + 1; \
721  if (end != group_end) { \
722  for (size_t pos = group_end; pos < end; pos++) { \
723  if (aggregation_tree_for_partition[pos] != null_val) { \
724  all_nulls = false; \
725  res = std::min(res, aggregation_tree_for_partition[pos]); \
726  } \
727  } \
728  } \
729  begin = parentBegin; \
730  end = parentEnd; \
731  } \
732  return invalid_val; \
733  }
734 DEF_COMPUTE_FRAME_MIN(int64_t)
735 DEF_COMPUTE_FRAME_MIN(double)
736 #undef DEF_COMPUTE_FRAME_MIN
737 
738 #define DEF_COMPUTE_FRAME_MAX(agg_value_type) \
739  extern "C" RUNTIME_EXPORT ALWAYS_INLINE agg_value_type \
740  compute_##agg_value_type##_frame_max( \
741  agg_value_type* aggregation_tree_for_partition, \
742  size_t query_range_start_idx, \
743  size_t query_range_end_idx, \
744  size_t leaf_level, \
745  size_t tree_fanout, \
746  agg_value_type invalid_val, \
747  agg_value_type null_val) { \
748  size_t leaf_start_idx = \
749  getStartOffsetForSegmentTreeTraversal(leaf_level, tree_fanout); \
750  size_t begin = leaf_start_idx + query_range_start_idx; \
751  size_t end = leaf_start_idx + query_range_end_idx; \
752  agg_value_type res = std::numeric_limits<agg_value_type>::lowest(); \
753  bool all_nulls = true; \
754  for (size_t level = leaf_level; level >= 0; level--) { \
755  size_t parentBegin = begin / tree_fanout; \
756  size_t parentEnd = (end - 1) / tree_fanout; \
757  if (parentBegin == parentEnd) { \
758  for (size_t pos = begin; pos < end; pos++) { \
759  if (aggregation_tree_for_partition[pos] != null_val) { \
760  all_nulls = false; \
761  res = std::max(res, aggregation_tree_for_partition[pos]); \
762  } \
763  } \
764  return all_nulls ? null_val : res; \
765  } \
766  size_t group_begin = (parentBegin * tree_fanout) + 1; \
767  if (begin != group_begin) { \
768  size_t limit = (parentBegin * tree_fanout) + tree_fanout + 1; \
769  for (size_t pos = begin; pos < limit; pos++) { \
770  if (aggregation_tree_for_partition[pos] != null_val) { \
771  all_nulls = false; \
772  res = std::max(res, aggregation_tree_for_partition[pos]); \
773  } \
774  } \
775  parentBegin++; \
776  } \
777  size_t group_end = (parentEnd * tree_fanout) + 1; \
778  if (end != group_end) { \
779  for (size_t pos = group_end; pos < end; pos++) { \
780  if (aggregation_tree_for_partition[pos] != null_val) { \
781  all_nulls = false; \
782  res = std::max(res, aggregation_tree_for_partition[pos]); \
783  } \
784  } \
785  } \
786  begin = parentBegin; \
787  end = parentEnd; \
788  } \
789  return invalid_val; \
790  }
791 DEF_COMPUTE_FRAME_MAX(int64_t)
792 DEF_COMPUTE_FRAME_MAX(double)
793 #undef DEF_COMPUTE_FRAME_MAX
794 
795 #define DEF_COMPUTE_FRAME_SUM(agg_value_type) \
796  extern "C" RUNTIME_EXPORT ALWAYS_INLINE agg_value_type \
797  compute_##agg_value_type##_frame_sum( \
798  agg_value_type* aggregation_tree_for_partition, \
799  size_t query_range_start_idx, \
800  size_t query_range_end_idx, \
801  size_t leaf_level, \
802  size_t tree_fanout, \
803  agg_value_type invalid_val, \
804  agg_value_type null_val) { \
805  size_t leaf_start_idx = \
806  getStartOffsetForSegmentTreeTraversal(leaf_level, tree_fanout); \
807  size_t begin = leaf_start_idx + query_range_start_idx; \
808  size_t end = leaf_start_idx + query_range_end_idx; \
809  agg_value_type res = 0; \
810  bool all_nulls = true; \
811  for (size_t level = leaf_level; level >= 0; level--) { \
812  size_t parentBegin = begin / tree_fanout; \
813  size_t parentEnd = (end - 1) / tree_fanout; \
814  if (parentBegin == parentEnd) { \
815  for (size_t pos = begin; pos < end; pos++) { \
816  if (aggregation_tree_for_partition[pos] != null_val) { \
817  all_nulls = false; \
818  res += aggregation_tree_for_partition[pos]; \
819  } \
820  } \
821  return all_nulls ? null_val : res; \
822  } \
823  size_t group_begin = (parentBegin * tree_fanout) + 1; \
824  if (begin != group_begin) { \
825  size_t limit = (parentBegin * tree_fanout) + tree_fanout + 1; \
826  for (size_t pos = begin; pos < limit; pos++) { \
827  if (aggregation_tree_for_partition[pos] != null_val) { \
828  all_nulls = false; \
829  res += aggregation_tree_for_partition[pos]; \
830  } \
831  } \
832  parentBegin++; \
833  } \
834  size_t group_end = (parentEnd * tree_fanout) + 1; \
835  if (end != group_end) { \
836  for (size_t pos = group_end; pos < end; pos++) { \
837  if (aggregation_tree_for_partition[pos] != null_val) { \
838  all_nulls = false; \
839  res += aggregation_tree_for_partition[pos]; \
840  } \
841  } \
842  } \
843  begin = parentBegin; \
844  end = parentEnd; \
845  } \
846  return invalid_val; \
847  }
848 DEF_COMPUTE_FRAME_SUM(int64_t)
849 DEF_COMPUTE_FRAME_SUM(double)
850 #undef DEF_COMPUTE_FRAME_SUM
851 
852 #define DEF_COMPUTE_SUM_AND_COUNT_FRAME_AGGREGATES(agg_value_type) \
853  extern "C" RUNTIME_EXPORT ALWAYS_INLINE void \
854  compute_##agg_value_type##_sum_and_count_frame_aggregates( \
855  SumAndCountPair<agg_value_type>* aggregation_tree_for_partition, \
856  SumAndCountPair<agg_value_type>& res, \
857  size_t query_range_start_idx, \
858  size_t query_range_end_idx, \
859  size_t leaf_level, \
860  size_t tree_fanout, \
861  agg_value_type invalid_val, \
862  agg_value_type null_val) { \
863  size_t leaf_start_idx = \
864  getStartOffsetForSegmentTreeTraversal(leaf_level, tree_fanout); \
865  size_t begin = leaf_start_idx + query_range_start_idx; \
866  size_t end = leaf_start_idx + query_range_end_idx; \
867  bool all_nulls = true; \
868  for (size_t level = leaf_level; level >= 0; level--) { \
869  size_t parentBegin = begin / tree_fanout; \
870  size_t parentEnd = (end - 1) / tree_fanout; \
871  if (parentBegin == parentEnd) { \
872  for (size_t pos = begin; pos < end; pos++) { \
873  if (aggregation_tree_for_partition[pos].sum != null_val) { \
874  all_nulls = false; \
875  res.sum += aggregation_tree_for_partition[pos].sum; \
876  res.count += aggregation_tree_for_partition[pos].count; \
877  } \
878  } \
879  if (all_nulls) { \
880  res.sum = null_val; \
881  res.count = 0; \
882  } \
883  return; \
884  } \
885  size_t group_begin = (parentBegin * tree_fanout) + 1; \
886  if (begin != group_begin) { \
887  size_t limit = (parentBegin * tree_fanout) + tree_fanout + 1; \
888  for (size_t pos = begin; pos < limit; pos++) { \
889  if (aggregation_tree_for_partition[pos].sum != null_val) { \
890  all_nulls = false; \
891  res.sum += aggregation_tree_for_partition[pos].sum; \
892  res.count += aggregation_tree_for_partition[pos].count; \
893  } \
894  } \
895  parentBegin++; \
896  } \
897  size_t group_end = (parentEnd * tree_fanout) + 1; \
898  if (end != group_end) { \
899  for (size_t pos = group_end; pos < end; pos++) { \
900  if (aggregation_tree_for_partition[pos].sum != null_val) { \
901  all_nulls = false; \
902  res.sum += aggregation_tree_for_partition[pos].sum; \
903  res.count += aggregation_tree_for_partition[pos].count; \
904  } \
905  } \
906  } \
907  begin = parentBegin; \
908  end = parentEnd; \
909  } \
910  res.sum = invalid_val; \
911  res.count = 0; \
912  return; \
913  }
914 DEF_COMPUTE_SUM_AND_COUNT_FRAME_AGGREGATES(int64_t)
915 DEF_COMPUTE_SUM_AND_COUNT_FRAME_AGGREGATES(double)
916 #undef DEF_COMPUTE_SUM_AND_COUNT_FRAME_AGGREGATES
917 
918 #define DEF_SEARCH_AGGREGATION_TREE(agg_value_type) \
919  extern "C" RUNTIME_EXPORT ALWAYS_INLINE agg_value_type \
920  search_##agg_value_type##_aggregation_tree( \
921  agg_value_type* aggregated_tree_for_partition, \
922  size_t query_range_start_idx, \
923  size_t query_range_end_idx, \
924  size_t leaf_level, \
925  size_t tree_fanout, \
926  bool decimal_type, \
927  size_t scale, \
928  agg_value_type invalid_val, \
929  agg_value_type null_val, \
930  int32_t agg_type) { \
931  if (!aggregated_tree_for_partition || query_range_start_idx > query_range_end_idx) { \
932  return null_val; \
933  } \
934  switch (agg_type) { \
935  case 1: { \
936  return compute_##agg_value_type##_frame_min(aggregated_tree_for_partition, \
937  query_range_start_idx, \
938  query_range_end_idx, \
939  leaf_level, \
940  tree_fanout, \
941  invalid_val, \
942  null_val); \
943  } \
944  case 2: { \
945  return compute_##agg_value_type##_frame_max(aggregated_tree_for_partition, \
946  query_range_start_idx, \
947  query_range_end_idx, \
948  leaf_level, \
949  tree_fanout, \
950  invalid_val, \
951  null_val); \
952  } \
953  default: { \
954  return compute_##agg_value_type##_frame_sum(aggregated_tree_for_partition, \
955  query_range_start_idx, \
956  query_range_end_idx, \
957  leaf_level, \
958  tree_fanout, \
959  invalid_val, \
960  null_val); \
961  } \
962  } \
963  }
964 
965 DEF_SEARCH_AGGREGATION_TREE(int64_t)
966 DEF_SEARCH_AGGREGATION_TREE(double)
967 #undef DEF_SEARCH_AGGREGATION_TREE
968 
969 #define DEF_SEARCH_DERIVED_AGGREGATION_TREE(agg_value_type) \
970  extern "C" RUNTIME_EXPORT ALWAYS_INLINE double \
971  search_##agg_value_type##_derived_aggregation_tree( \
972  SumAndCountPair<agg_value_type>* aggregated_tree_for_partition, \
973  size_t query_range_start_idx, \
974  size_t query_range_end_idx, \
975  size_t leaf_level, \
976  size_t tree_fanout, \
977  bool decimal_type, \
978  size_t scale, \
979  agg_value_type invalid_val, \
980  agg_value_type null_val, \
981  int32_t agg_type) { \
982  if (!aggregated_tree_for_partition || query_range_start_idx > query_range_end_idx) { \
983  return null_val; \
984  } \
985  SumAndCountPair<agg_value_type> res{0, 0}; \
986  compute_##agg_value_type##_sum_and_count_frame_aggregates( \
987  aggregated_tree_for_partition, \
988  res, \
989  query_range_start_idx, \
990  query_range_end_idx, \
991  leaf_level, \
992  tree_fanout, \
993  invalid_val, \
994  null_val); \
995  if (res.sum == null_val) { \
996  return null_val; \
997  } else if (res.count > 0) { \
998  if (decimal_type) { \
999  return (static_cast<double>(res.sum) / pow(10, scale)) / res.count; \
1000  } \
1001  return (static_cast<double>(res.sum)) / res.count; \
1002  } else { \
1003  return invalid_val; \
1004  } \
1005  }
1006 
1007 DEF_SEARCH_DERIVED_AGGREGATION_TREE(int64_t)
1008 DEF_SEARCH_DERIVED_AGGREGATION_TREE(double)
1009 #undef DEF_SEARCH_DERIVED_AGGREGATION_TREE
1010 
1011 #define DEF_HANDLE_NULL_FOR_WINDOW_FRAMING_AGG(agg_type, null_type) \
1012  extern "C" RUNTIME_EXPORT ALWAYS_INLINE agg_type \
1013  handle_null_val_##agg_type##_##null_type##_window_framing_agg( \
1014  agg_type res, null_type agg_null_val, agg_type input_col_null_val) { \
1015  if (res == agg_null_val) { \
1016  return input_col_null_val; \
1017  } \
1018  return res; \
1019  }
1020 DEF_HANDLE_NULL_FOR_WINDOW_FRAMING_AGG(int64_t, int64_t)
1021 DEF_HANDLE_NULL_FOR_WINDOW_FRAMING_AGG(double, int64_t)
1022 DEF_HANDLE_NULL_FOR_WINDOW_FRAMING_AGG(double, double)
1023 #undef DEF_HANDLE_NULL_FOR_WINDOW_FRAMING_AGG
1024 
1025 extern "C" RUNTIME_EXPORT ALWAYS_INLINE int64_t agg_sum(int64_t* agg, const int64_t val) {
1026  const auto old = *agg;
1027  *agg += val;
1028  return old;
1029 }
1030 
1031 extern "C" RUNTIME_EXPORT ALWAYS_INLINE void agg_max(int64_t* agg, const int64_t val) {
1032  *agg = std::max(*agg, val);
1033 }
1034 
1035 extern "C" RUNTIME_EXPORT ALWAYS_INLINE void agg_min(int64_t* agg, const int64_t val) {
1036  *agg = std::min(*agg, val);
1037 }
1038 
1039 extern "C" RUNTIME_EXPORT ALWAYS_INLINE void agg_id(int64_t* agg, const int64_t val) {
1040  *agg = val;
1041 }
1042 
1043 extern "C" RUNTIME_EXPORT ALWAYS_INLINE int8_t* agg_id_varlen(int8_t* varlen_buffer,
1044  const int64_t offset,
1045  const int8_t* value,
1046  const int64_t size_bytes) {
1047  for (auto i = 0; i < size_bytes; i++) {
1048  varlen_buffer[offset + i] = value[i];
1049  }
1050  return &varlen_buffer[offset];
1051 }
1052 
1053 extern "C" RUNTIME_EXPORT ALWAYS_INLINE int32_t
1054 checked_single_agg_id(int64_t* agg, const int64_t val, const int64_t null_val) {
1055  if (val == null_val) {
1056  return 0;
1057  }
1058 
1059  if (*agg == val) {
1060  return 0;
1061  } else if (*agg == null_val) {
1062  *agg = val;
1063  return 0;
1064  } else {
1065  // see Execute::ERR_SINGLE_VALUE_FOUND_MULTIPLE_VALUES
1066  return 15;
1067  }
1068 }
1069 
1070 extern "C" RUNTIME_EXPORT ALWAYS_INLINE void agg_count_distinct_bitmap_skip_val(
1071  int64_t* agg,
1072  const int64_t val,
1073  const int64_t min_val,
1074  const int64_t skip_val) {
1075  if (val != skip_val) {
1076  agg_count_distinct_bitmap(agg, val, min_val);
1077  }
1078 }
1079 
1079 
1080 extern "C" GPU_RT_STUB void agg_count_distinct_bitmap_skip_val_gpu(int64_t*,
1081  const int64_t,
1082  const int64_t,
1083  const int64_t,
1084  const int64_t,
1085  const int64_t,
1086  const uint64_t,
1087  const uint64_t) {}
1088 
1089 extern "C" RUNTIME_EXPORT ALWAYS_INLINE uint32_t agg_count_int32(uint32_t* agg,
1090  const int32_t) {
1091  return (*agg)++;
1092 }
1093 
1094 extern "C" RUNTIME_EXPORT ALWAYS_INLINE int32_t agg_sum_int32(int32_t* agg,
1095  const int32_t val) {
1096  const auto old = *agg;
1097  *agg += val;
1098  return old;
1099 }
1100 
1101 #define DEF_AGG_MAX_INT(n) \
1102  extern "C" RUNTIME_EXPORT ALWAYS_INLINE void agg_max_int##n(int##n##_t* agg, \
1103  const int##n##_t val) { \
1104  *agg = std::max(*agg, val); \
1105  }
1106 
1107 DEF_AGG_MAX_INT(32)
1108 DEF_AGG_MAX_INT(16)
1109 DEF_AGG_MAX_INT(8)
1110 #undef DEF_AGG_MAX_INT
1111 
1112 #define DEF_AGG_MIN_INT(n) \
1113  extern "C" RUNTIME_EXPORT ALWAYS_INLINE void agg_min_int##n(int##n##_t* agg, \
1114  const int##n##_t val) { \
1115  *agg = std::min(*agg, val); \
1116  }
1117 
1118 DEF_AGG_MIN_INT(32)
1119 DEF_AGG_MIN_INT(16)
1120 DEF_AGG_MIN_INT(8)
1121 #undef DEF_AGG_MIN_INT
1122 
1123 #define DEF_AGG_ID_INT(n) \
1124  extern "C" RUNTIME_EXPORT ALWAYS_INLINE void agg_id_int##n(int##n##_t* agg, \
1125  const int##n##_t val) { \
1126  *agg = val; \
1127  }
1128 
1129 #define DEF_CHECKED_SINGLE_AGG_ID_INT(n) \
1130  extern "C" RUNTIME_EXPORT ALWAYS_INLINE int32_t checked_single_agg_id_int##n( \
1131  int##n##_t* agg, const int##n##_t val, const int##n##_t null_val) { \
1132  if (val == null_val) { \
1133  return 0; \
1134  } \
1135  if (*agg == val) { \
1136  return 0; \
1137  } else if (*agg == null_val) { \
1138  *agg = val; \
1139  return 0; \
1140  } else { \
1141  /* see Execute::ERR_SINGLE_VALUE_FOUND_MULTIPLE_VALUES*/ \
1142  return 15; \
1143  } \
1144  }
1145 
1146 DEF_AGG_ID_INT(32)
1147 DEF_AGG_ID_INT(16)
1148 DEF_AGG_ID_INT(8)
1149 
1150 DEF_CHECKED_SINGLE_AGG_ID_INT(32)
1151 DEF_CHECKED_SINGLE_AGG_ID_INT(16)
1152 DEF_CHECKED_SINGLE_AGG_ID_INT(8)
1153 
1154 #undef DEF_AGG_ID_INT
1155 #undef DEF_CHECKED_SINGLE_AGG_ID_INT
1156 
1157 #define DEF_WRITE_PROJECTION_INT(n) \
1158  extern "C" RUNTIME_EXPORT ALWAYS_INLINE void write_projection_int##n( \
1159  int8_t* slot_ptr, const int##n##_t val, const int64_t init_val) { \
1160  if (val != init_val) { \
1161  *reinterpret_cast<int##n##_t*>(slot_ptr) = val; \
1162  } \
1163  }
1164 
1165 DEF_WRITE_PROJECTION_INT(64)
1166 DEF_WRITE_PROJECTION_INT(32)
1167 #undef DEF_WRITE_PROJECTION_INT
1168 
1169 extern "C" RUNTIME_EXPORT ALWAYS_INLINE int64_t agg_sum_skip_val(int64_t* agg,
1170  const int64_t val,
1171  const int64_t skip_val) {
1172  const auto old = *agg;
1173  if (val != skip_val) {
1174  if (old != skip_val) {
1175  return agg_sum(agg, val);
1176  } else {
1177  *agg = val;
1178  }
1179  }
1180  return old;
1181 }
1182 
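// Sketch of the skip_val protocol used by the *_skip_val aggregates: the
// aggregate slot starts out holding the NULL sentinel, the first non-NULL input
// replaces it, and later non-NULL inputs accumulate normally. With an assumed
// sentinel of -1 (for illustration only):
//   int64_t acc = -1;
//   agg_sum_skip_val(&acc, -1, -1);   // NULL input, acc stays -1
//   agg_sum_skip_val(&acc,  5, -1);   // first real value, acc becomes 5
//   agg_sum_skip_val(&acc,  2, -1);   // acc becomes 7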
1183 extern "C" RUNTIME_EXPORT ALWAYS_INLINE int32_t
1184 agg_sum_int32_skip_val(int32_t* agg, const int32_t val, const int32_t skip_val) {
1185  const auto old = *agg;
1186  if (val != skip_val) {
1187  if (old != skip_val) {
1188  return agg_sum_int32(agg, val);
1189  } else {
1190  *agg = val;
1191  }
1192  }
1193  return old;
1194 }
1195 
1196 extern "C" RUNTIME_EXPORT ALWAYS_INLINE uint64_t
1197 agg_count_skip_val(uint64_t* agg, const int64_t val, const int64_t skip_val) {
1198  if (val != skip_val) {
1199  return agg_count(agg, val);
1200  }
1201  return *agg;
1202 }
1203 
1204 extern "C" RUNTIME_EXPORT ALWAYS_INLINE uint32_t
1205 agg_count_int32_skip_val(uint32_t* agg, const int32_t val, const int32_t skip_val) {
1206  if (val != skip_val) {
1207  return agg_count_int32(agg, val);
1208  }
1209  return *agg;
1210 }
1211 
1212 #define DEF_SKIP_AGG_ADD(base_agg_func) \
1213  extern "C" RUNTIME_EXPORT ALWAYS_INLINE void base_agg_func##_skip_val( \
1214  DATA_T* agg, const DATA_T val, const DATA_T skip_val) { \
1215  if (val != skip_val) { \
1216  base_agg_func(agg, val); \
1217  } \
1218  }
1219 
1220 #define DEF_SKIP_AGG(base_agg_func) \
1221  extern "C" RUNTIME_EXPORT ALWAYS_INLINE void base_agg_func##_skip_val( \
1222  DATA_T* agg, const DATA_T val, const DATA_T skip_val) { \
1223  if (val != skip_val) { \
1224  const DATA_T old_agg = *agg; \
1225  if (old_agg != skip_val) { \
1226  base_agg_func(agg, val); \
1227  } else { \
1228  *agg = val; \
1229  } \
1230  } \
1231  }
1232 
1233 #define DATA_T int64_t
1234 DEF_SKIP_AGG(agg_max)
1235 DEF_SKIP_AGG(agg_min)
1236 #undef DATA_T
1237 
1238 #define DATA_T int32_t
1239 DEF_SKIP_AGG(agg_max_int32)
1240 DEF_SKIP_AGG(agg_min_int32)
1241 #undef DATA_T
1242 
1243 #define DATA_T int16_t
1244 DEF_SKIP_AGG(agg_max_int16)
1245 DEF_SKIP_AGG(agg_min_int16)
1246 #undef DATA_T
1247 
1248 #define DATA_T int8_t
1249 DEF_SKIP_AGG(agg_max_int8)
1250 DEF_SKIP_AGG(agg_min_int8)
1251 #undef DATA_T
1252 
1253 #undef DEF_SKIP_AGG_ADD
1254 #undef DEF_SKIP_AGG
1255 
1256 // TODO(alex): fix signature
1257 
1258 extern "C" RUNTIME_EXPORT ALWAYS_INLINE uint64_t agg_count_double(uint64_t* agg,
1259  const double val) {
1260  return (*agg)++;
1261 }
1262 
1263 extern "C" RUNTIME_EXPORT ALWAYS_INLINE void agg_sum_double(int64_t* agg,
1264  const double val) {
1265  const auto r = *reinterpret_cast<const double*>(agg) + val;
1266  *agg = *reinterpret_cast<const int64_t*>(may_alias_ptr(&r));
1267 }
1268 
1269 extern "C" RUNTIME_EXPORT ALWAYS_INLINE void agg_max_double(int64_t* agg,
1270  const double val) {
1271  const auto r = std::max(*reinterpret_cast<const double*>(agg), val);
1272  *agg = *(reinterpret_cast<const int64_t*>(may_alias_ptr(&r)));
1273 }
1274 
1275 extern "C" RUNTIME_EXPORT ALWAYS_INLINE void agg_min_double(int64_t* agg,
1276  const double val) {
1277  const auto r = std::min(*reinterpret_cast<const double*>(agg), val);
1278  *agg = *(reinterpret_cast<const int64_t*>(may_alias_ptr(&r)));
1279 }
1280 
1281 extern "C" RUNTIME_EXPORT ALWAYS_INLINE void agg_id_double(int64_t* agg,
1282  const double val) {
1283  *agg = *(reinterpret_cast<const int64_t*>(may_alias_ptr(&val)));
1284 }
1285 
1286 extern "C" RUNTIME_EXPORT ALWAYS_INLINE int32_t
1287 checked_single_agg_id_double(int64_t* agg, const double val, const double null_val) {
1288  if (val == null_val) {
1289  return 0;
1290  }
1291 
1292  if (*agg == *(reinterpret_cast<const int64_t*>(may_alias_ptr(&val)))) {
1293  return 0;
1294  } else if (*agg == *(reinterpret_cast<const int64_t*>(may_alias_ptr(&null_val)))) {
1295  *agg = *(reinterpret_cast<const int64_t*>(may_alias_ptr(&val)));
1296  return 0;
1297  } else {
1298  // see Execute::ERR_SINGLE_VALUE_FOUND_MULTIPLE_VALUES
1299  return 15;
1300  }
1301 }
1302 
1303 extern "C" RUNTIME_EXPORT ALWAYS_INLINE uint32_t agg_count_float(uint32_t* agg,
1304  const float val) {
1305  return (*agg)++;
1306 }
1307 
1308 extern "C" RUNTIME_EXPORT ALWAYS_INLINE void agg_sum_float(int32_t* agg,
1309  const float val) {
1310  const auto r = *reinterpret_cast<const float*>(agg) + val;
1311  *agg = *reinterpret_cast<const int32_t*>(may_alias_ptr(&r));
1312 }
1313 
1314 extern "C" RUNTIME_EXPORT ALWAYS_INLINE void agg_max_float(int32_t* agg,
1315  const float val) {
1316  const auto r = std::max(*reinterpret_cast<const float*>(agg), val);
1317  *agg = *(reinterpret_cast<const int32_t*>(may_alias_ptr(&r)));
1318 }
1319 
1320 extern "C" RUNTIME_EXPORT ALWAYS_INLINE void agg_min_float(int32_t* agg,
1321  const float val) {
1322  const auto r = std::min(*reinterpret_cast<const float*>(agg), val);
1323  *agg = *(reinterpret_cast<const int32_t*>(may_alias_ptr(&r)));
1324 }
1325 
1326 extern "C" RUNTIME_EXPORT ALWAYS_INLINE void agg_id_float(int32_t* agg, const float val) {
1327  *agg = *(reinterpret_cast<const int32_t*>(may_alias_ptr(&val)));
1328 }
1329 
1330 extern "C" RUNTIME_EXPORT ALWAYS_INLINE int32_t
1331 checked_single_agg_id_float(int32_t* agg, const float val, const float null_val) {
1332  if (val == null_val) {
1333  return 0;
1334  }
1335 
1336  if (*agg == *(reinterpret_cast<const int32_t*>(may_alias_ptr(&val)))) {
1337  return 0;
1338  } else if (*agg == *(reinterpret_cast<const int32_t*>(may_alias_ptr(&null_val)))) {
1339  *agg = *(reinterpret_cast<const int32_t*>(may_alias_ptr(&val)));
1340  return 0;
1341  } else {
1342  // see Execute::ERR_SINGLE_VALUE_FOUND_MULTIPLE_VALUES
1343  return 15;
1344  }
1345 }
1346 
1347 extern "C" RUNTIME_EXPORT ALWAYS_INLINE uint64_t
1348 agg_count_double_skip_val(uint64_t* agg, const double val, const double skip_val) {
1349  if (val != skip_val) {
1350  return agg_count_double(agg, val);
1351  }
1352  return *agg;
1353 }
1354 
1355 extern "C" RUNTIME_EXPORT ALWAYS_INLINE uint32_t
1356 agg_count_float_skip_val(uint32_t* agg, const float val, const float skip_val) {
1357  if (val != skip_val) {
1358  return agg_count_float(agg, val);
1359  }
1360  return *agg;
1361 }
1362 
1363 #define DEF_SKIP_AGG_ADD(base_agg_func) \
1364  extern "C" RUNTIME_EXPORT ALWAYS_INLINE void base_agg_func##_skip_val( \
1365  ADDR_T* agg, const DATA_T val, const DATA_T skip_val) { \
1366  if (val != skip_val) { \
1367  base_agg_func(agg, val); \
1368  } \
1369  }
1370 
1371 #define DEF_SKIP_AGG(base_agg_func) \
1372  extern "C" RUNTIME_EXPORT ALWAYS_INLINE void base_agg_func##_skip_val( \
1373  ADDR_T* agg, const DATA_T val, const DATA_T skip_val) { \
1374  if (val != skip_val) { \
1375  const ADDR_T old_agg = *agg; \
1376  if (old_agg != *reinterpret_cast<const ADDR_T*>(may_alias_ptr(&skip_val))) { \
1377  base_agg_func(agg, val); \
1378  } else { \
1379  *agg = *reinterpret_cast<const ADDR_T*>(may_alias_ptr(&val)); \
1380  } \
1381  } \
1382  }
1383 
1384 #define DATA_T double
1385 #define ADDR_T int64_t
1386 DEF_SKIP_AGG_ADD(agg_sum_double)
1387 DEF_SKIP_AGG(agg_max_double)
1388 DEF_SKIP_AGG(agg_min_double)
1389 #undef ADDR_T
1390 #undef DATA_T
1391 
1392 #define DATA_T float
1393 #define ADDR_T int32_t
1394 DEF_SKIP_AGG_ADD(agg_sum_float)
1395 DEF_SKIP_AGG(agg_max_float)
1396 DEF_SKIP_AGG(agg_min_float)
1397 #undef ADDR_T
1398 #undef DATA_T
1399 
1400 #undef DEF_SKIP_AGG_ADD
1401 #undef DEF_SKIP_AGG
1402 
1403 extern "C" RUNTIME_EXPORT ALWAYS_INLINE int64_t decimal_floor(const int64_t x,
1404  const int64_t scale) {
1405  if (x >= 0) {
1406  return x / scale * scale;
1407  }
1408  if (!(x % scale)) {
1409  return x;
1410  }
1411  return x / scale * scale - scale;
1412 }
1413 
1414 extern "C" RUNTIME_EXPORT ALWAYS_INLINE int64_t decimal_ceil(const int64_t x,
1415  const int64_t scale) {
1416  return decimal_floor(x, scale) + (x % scale ? scale : 0);
1417 }
1418 
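// Worked example: with a DECIMAL stored as an int64_t in units of 1/1000
// (scale = 1000), 1.250 is stored as 1250 and -1.250 as -1250:
//   decimal_floor(1250, 1000)  ==  1000   // floor(1.25)  ==  1
//   decimal_floor(-1250, 1000) == -2000   // floor(-1.25) == -2
//   decimal_ceil(1250, 1000)   ==  2000   // ceil(1.25)   ==  2
//   decimal_ceil(-1250, 1000)  == -1000   // ceil(-1.25)  == -1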
1419 // Shared memory aggregators. Should never be called,
1420 // real implementations are in cuda_mapd_rt.cu.
1421 #define DEF_SHARED_AGG_RET_STUBS(base_agg_func) \
1422  extern "C" GPU_RT_STUB uint64_t base_agg_func##_shared(uint64_t* agg, \
1423  const int64_t val) { \
1424  return 0; \
1425  } \
1426  \
1427  extern "C" GPU_RT_STUB uint64_t base_agg_func##_skip_val_shared( \
1428  uint64_t* agg, const int64_t val, const int64_t skip_val) { \
1429  return 0; \
1430  } \
1431  extern "C" GPU_RT_STUB uint32_t base_agg_func##_int32_shared(uint32_t* agg, \
1432  const int32_t val) { \
1433  return 0; \
1434  } \
1435  \
1436  extern "C" GPU_RT_STUB uint32_t base_agg_func##_int32_skip_val_shared( \
1437  uint32_t* agg, const int32_t val, const int32_t skip_val) { \
1438  return 0; \
1439  } \
1440  \
1441  extern "C" GPU_RT_STUB uint64_t base_agg_func##_double_shared(uint64_t* agg, \
1442  const double val) { \
1443  return 0; \
1444  } \
1445  \
1446  extern "C" GPU_RT_STUB uint64_t base_agg_func##_double_skip_val_shared( \
1447  uint64_t* agg, const double val, const double skip_val) { \
1448  return 0; \
1449  } \
1450  extern "C" GPU_RT_STUB uint32_t base_agg_func##_float_shared(uint32_t* agg, \
1451  const float val) { \
1452  return 0; \
1453  } \
1454  \
1455  extern "C" GPU_RT_STUB uint32_t base_agg_func##_float_skip_val_shared( \
1456  uint32_t* agg, const float val, const float skip_val) { \
1457  return 0; \
1458  }
1459 
1460 #define DEF_SHARED_AGG_STUBS(base_agg_func) \
1461  extern "C" GPU_RT_STUB void base_agg_func##_shared(int64_t* agg, const int64_t val) {} \
1462  \
1463  extern "C" GPU_RT_STUB void base_agg_func##_skip_val_shared( \
1464  int64_t* agg, const int64_t val, const int64_t skip_val) {} \
1465  extern "C" GPU_RT_STUB void base_agg_func##_int32_shared(int32_t* agg, \
1466  const int32_t val) {} \
1467  extern "C" GPU_RT_STUB void base_agg_func##_int16_shared(int16_t* agg, \
1468  const int16_t val) {} \
1469  extern "C" GPU_RT_STUB void base_agg_func##_int8_shared(int8_t* agg, \
1470  const int8_t val) {} \
1471  \
1472  extern "C" GPU_RT_STUB void base_agg_func##_int32_skip_val_shared( \
1473  int32_t* agg, const int32_t val, const int32_t skip_val) {} \
1474  \
1475  extern "C" GPU_RT_STUB void base_agg_func##_double_shared(int64_t* agg, \
1476  const double val) {} \
1477  \
1478  extern "C" GPU_RT_STUB void base_agg_func##_double_skip_val_shared( \
1479  int64_t* agg, const double val, const double skip_val) {} \
1480  extern "C" GPU_RT_STUB void base_agg_func##_float_shared(int32_t* agg, \
1481  const float val) {} \
1482  \
1483  extern "C" GPU_RT_STUB void base_agg_func##_float_skip_val_shared( \
1484  int32_t* agg, const float val, const float skip_val) {}
1485 
1486 DEF_SHARED_AGG_RET_STUBS(agg_count)
1487 DEF_SHARED_AGG_STUBS(agg_max)
1488 DEF_SHARED_AGG_STUBS(agg_min)
1489 DEF_SHARED_AGG_STUBS(agg_id)
1490 
1491 extern "C" GPU_RT_STUB int8_t* agg_id_varlen_shared(int8_t* varlen_buffer,
1492  const int64_t offset,
1493  const int8_t* value,
1494  const int64_t size_bytes) {
1495  return nullptr;
1496 }
1497 
1498 extern "C" GPU_RT_STUB int32_t checked_single_agg_id_shared(int64_t* agg,
1499  const int64_t val,
1500  const int64_t null_val) {
1501  return 0;
1502 }
1503 
1504 extern "C" GPU_RT_STUB int32_t
1505 checked_single_agg_id_int32_shared(int32_t* agg,
1506  const int32_t val,
1507  const int32_t null_val) {
1508  return 0;
1509 }
1510 extern "C" GPU_RT_STUB int32_t
1511 checked_single_agg_id_int16_shared(int16_t* agg,
1512  const int16_t val,
1513  const int16_t null_val) {
1514  return 0;
1515 }
1516 extern "C" GPU_RT_STUB int32_t checked_single_agg_id_int8_shared(int8_t* agg,
1517  const int8_t val,
1518  const int8_t null_val) {
1519  return 0;
1520 }
1521 
1522 extern "C" GPU_RT_STUB int32_t
1523 checked_single_agg_id_double_shared(int64_t* agg,
1524  const double val,
1525  const double null_val) {
1526  return 0;
1527 }
1528 
1529 extern "C" GPU_RT_STUB int32_t checked_single_agg_id_float_shared(int32_t* agg,
1530  const float val,
1531  const float null_val) {
1532  return 0;
1533 }
1534 
1535 extern "C" GPU_RT_STUB void agg_max_int16_skip_val_shared(int16_t* agg,
1536  const int16_t val,
1537  const int16_t skip_val) {}
1538 
1539 extern "C" GPU_RT_STUB void agg_max_int8_skip_val_shared(int8_t* agg,
1540  const int8_t val,
1541  const int8_t skip_val) {}
1542 
1543 extern "C" GPU_RT_STUB void agg_min_int16_skip_val_shared(int16_t* agg,
1544  const int16_t val,
1545  const int16_t skip_val) {}
1546 
1547 extern "C" GPU_RT_STUB void agg_min_int8_skip_val_shared(int8_t* agg,
1548  const int8_t val,
1549  const int8_t skip_val) {}
1550 
1551 extern "C" GPU_RT_STUB void agg_id_double_shared_slow(int64_t* agg, const double* val) {}
1552 
1553 extern "C" GPU_RT_STUB int64_t agg_sum_shared(int64_t* agg, const int64_t val) {
1554  return 0;
1555 }
1556 
1557 extern "C" GPU_RT_STUB int64_t agg_sum_skip_val_shared(int64_t* agg,
1558  const int64_t val,
1559  const int64_t skip_val) {
1560  return 0;
1561 }
1562 extern "C" GPU_RT_STUB int32_t agg_sum_int32_shared(int32_t* agg, const int32_t val) {
1563  return 0;
1564 }
1565 
1566 extern "C" GPU_RT_STUB int32_t agg_sum_int32_skip_val_shared(int32_t* agg,
1567  const int32_t val,
1568  const int32_t skip_val) {
1569  return 0;
1570 }
1571 
1572 extern "C" GPU_RT_STUB void agg_sum_double_shared(int64_t* agg, const double val) {}
1573 
1574 extern "C" GPU_RT_STUB void agg_sum_double_skip_val_shared(int64_t* agg,
1575  const double val,
1576  const double skip_val) {}
1577 extern "C" GPU_RT_STUB void agg_sum_float_shared(int32_t* agg, const float val) {}
1578 
1579 extern "C" GPU_RT_STUB void agg_sum_float_skip_val_shared(int32_t* agg,
1580  const float val,
1581  const float skip_val) {}
1582 
1583 extern "C" GPU_RT_STUB void force_sync() {}
1584 
1585 extern "C" GPU_RT_STUB void sync_warp() {}
1586 extern "C" GPU_RT_STUB void sync_warp_protected(int64_t thread_pos, int64_t row_count) {}
1587 extern "C" GPU_RT_STUB void sync_threadblock() {}
1588 
1589 extern "C" GPU_RT_STUB void write_back_non_grouped_agg(int64_t* input_buffer,
1590  int64_t* output_buffer,
1591  const int32_t num_agg_cols){};
1592 // x64 stride functions
1593 
1594 extern "C" RUNTIME_EXPORT NEVER_INLINE int32_t pos_start_impl(int32_t* error_code) {
1595  int32_t row_index_resume{0};
1596  if (error_code) {
1597  row_index_resume = error_code[0];
1598  error_code[0] = 0;
1599  }
1600  return row_index_resume;
1601 }
1602 
1603 extern "C" RUNTIME_EXPORT NEVER_INLINE int32_t group_buff_idx_impl() {
1604  return pos_start_impl(nullptr);
1605 }
1606 
1607 extern "C" RUNTIME_EXPORT NEVER_INLINE int32_t pos_step_impl() {
1608  return 1;
1609 }
1610 
1611 extern "C" GPU_RT_STUB int8_t thread_warp_idx(const int8_t warp_sz) {
1612  return 0;
1613 }
1614 
1615 extern "C" GPU_RT_STUB int64_t get_thread_index() {
1616  return 0;
1617 }
1618 
1619 extern "C" GPU_RT_STUB int64_t* declare_dynamic_shared_memory() {
1620  return nullptr;
1621 }
1622 
1623 extern "C" GPU_RT_STUB int64_t get_block_index() {
1624  return 0;
1625 }
1626 
1627 #undef GPU_RT_STUB
1628 
1629 extern "C" RUNTIME_EXPORT ALWAYS_INLINE void record_error_code(const int32_t err_code,
1630  int32_t* error_codes) {
1631  // NB: never override persistent error codes (with code greater than zero).
1632  // On GPU, a projection query with a limit can run out of slots without it
1633  // being an actual error if the limit has been hit. If a persistent error
1634  // (division by zero, for example) occurs before running out of slots, we
1635  // have to avoid overriding it, because there's a risk that the query would
1636  // go through if we override with a potentially benign out-of-slots code.
1637  if (err_code && error_codes[pos_start_impl(nullptr)] <= 0) {
1638  error_codes[pos_start_impl(nullptr)] = err_code;
1639  }
1640 }
1641 
1642 extern "C" RUNTIME_EXPORT ALWAYS_INLINE int32_t get_error_code(int32_t* error_codes) {
1643  return error_codes[pos_start_impl(nullptr)];
1644 }
1645 
1646 // group by helpers
1647 
1648 extern "C" RUNTIME_EXPORT NEVER_INLINE const int64_t* init_shared_mem_nop(
1649  const int64_t* groups_buffer,
1650  const int32_t groups_buffer_size) {
1651  return groups_buffer;
1652 }
1653 
1654 extern "C" RUNTIME_EXPORT NEVER_INLINE void write_back_nop(int64_t* dest,
1655  int64_t* src,
1656  const int32_t sz) {
1657 #ifndef _WIN32
1658  // the body is not really needed, just make sure the call is not optimized away
1659  assert(dest);
1660 #endif
1661 }
1662 
1663 extern "C" RUNTIME_EXPORT int64_t* init_shared_mem(const int64_t* global_groups_buffer,
1664  const int32_t groups_buffer_size) {
1665  return nullptr;
1666 }
1667 
1668 extern "C" RUNTIME_EXPORT NEVER_INLINE void init_group_by_buffer_gpu(
1669  int64_t* groups_buffer,
1670  const int64_t* init_vals,
1671  const uint32_t groups_buffer_entry_count,
1672  const uint32_t key_qw_count,
1673  const uint32_t agg_col_count,
1674  const bool keyless,
1675  const int8_t warp_size) {
1676 #ifndef _WIN32
1677  // the body is not really needed, just make sure the call is not optimized away
1678  assert(groups_buffer);
1679 #endif
1680 }
1681 
1682 extern "C" RUNTIME_EXPORT NEVER_INLINE void init_columnar_group_by_buffer_gpu(
1683  int64_t* groups_buffer,
1684  const int64_t* init_vals,
1685  const uint32_t groups_buffer_entry_count,
1686  const uint32_t key_qw_count,
1687  const uint32_t agg_col_count,
1688  const bool keyless,
1689  const bool blocks_share_memory,
1690  const int32_t frag_idx) {
1691 #ifndef _WIN32
1692  // the body is not really needed, just make sure the call is not optimized away
1693  assert(groups_buffer);
1694 #endif
1695 }
1696 
1697 extern "C" RUNTIME_EXPORT NEVER_INLINE void init_group_by_buffer_impl(
1698  int64_t* groups_buffer,
1699  const int64_t* init_vals,
1700  const uint32_t groups_buffer_entry_count,
1701  const uint32_t key_qw_count,
1702  const uint32_t agg_col_count,
1703  const bool keyless,
1704  const int8_t warp_size) {
1705 #ifndef _WIN32
1706  // the body is not really needed, just make sure the call is not optimized away
1707  assert(groups_buffer);
1708 #endif
1709 }
1710 
1711 template <typename T>
1712 ALWAYS_INLINE int64_t* get_matching_group_value(int64_t* groups_buffer,
1713  const uint32_t h,
1714  const T* key,
1715  const uint32_t key_count,
1716  const uint32_t row_size_quad) {
1717  auto off = h * row_size_quad;
1718  auto row_ptr = reinterpret_cast<T*>(groups_buffer + off);
1719  if (*row_ptr == get_empty_key<T>()) {
1720  memcpy(row_ptr, key, key_count * sizeof(T));
1721  auto row_ptr_i8 = reinterpret_cast<int8_t*>(row_ptr + key_count);
1722  return reinterpret_cast<int64_t*>(align_to_int64(row_ptr_i8));
1723  }
1724  if (memcmp(row_ptr, key, key_count * sizeof(T)) == 0) {
1725  auto row_ptr_i8 = reinterpret_cast<int8_t*>(row_ptr + key_count);
1726  return reinterpret_cast<int64_t*>(align_to_int64(row_ptr_i8));
1727  }
1728  return nullptr;
1729 }
1730 
1731 extern "C" RUNTIME_EXPORT ALWAYS_INLINE int64_t* get_matching_group_value(
1732  int64_t* groups_buffer,
1733  const uint32_t h,
1734  const int64_t* key,
1735  const uint32_t key_count,
1736  const uint32_t key_width,
1737  const uint32_t row_size_quad) {
1738  switch (key_width) {
1739  case 4:
1740  return get_matching_group_value(groups_buffer,
1741  h,
1742  reinterpret_cast<const int32_t*>(key),
1743  key_count,
1744  row_size_quad);
1745  case 8:
1746  return get_matching_group_value(groups_buffer, h, key, key_count, row_size_quad);
1747  default:;
1748  }
1749  return nullptr;
1750 }
1751 
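get_matching_group_value performs a single probe of a row-wise group-by hash table: each entry stores the key columns first, then the aggregate columns, with the aggregate area realigned to a 64-bit boundary via align_to_int64. The surrounding probing loop lives in generated code; the sketch below only illustrates the open-addressing pattern a caller is assumed to follow (it is not code from this file):

// Linear-probing sketch around get_matching_group_value. start_hash is the bucket
// produced by hashing the key; on a collision we advance to the next entry.
int64_t* probe_row_wise_slot(int64_t* groups_buffer,
                             const uint32_t entry_count,
                             const uint32_t start_hash,
                             const int64_t* key,
                             const uint32_t key_count,
                             const uint32_t key_width,
                             const uint32_t row_size_quad) {
  uint32_t h = start_hash % entry_count;
  const uint32_t first = h;
  do {
    int64_t* slot = get_matching_group_value(
        groups_buffer, h, key, key_count, key_width, row_size_quad);
    if (slot) {
      return slot;  // empty slot (key just written) or an existing match
    }
    h = (h + 1) % entry_count;  // occupied by a different key: keep probing
  } while (h != first);
  return nullptr;  // table is full
}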
1752 template <typename T>
1753 ALWAYS_INLINE int32_t get_matching_group_value_columnar_slot(int64_t* groups_buffer,
1754  const uint32_t entry_count,
1755  const uint32_t h,
1756  const T* key,
1757  const uint32_t key_count) {
1758  auto off = h;
1759  auto key_buffer = reinterpret_cast<T*>(groups_buffer);
1760  if (key_buffer[off] == get_empty_key<T>()) {
1761  for (size_t i = 0; i < key_count; ++i) {
1762  key_buffer[off] = key[i];
1763  off += entry_count;
1764  }
1765  return h;
1766  }
1767  off = h;
1768  for (size_t i = 0; i < key_count; ++i) {
1769  if (key_buffer[off] != key[i]) {
1770  return -1;
1771  }
1772  off += entry_count;
1773  }
1774  return h;
1775 }
1776 
1777 extern "C" RUNTIME_EXPORT ALWAYS_INLINE int32_t
1778 get_matching_group_value_columnar_slot(int64_t* groups_buffer,
1779  const uint32_t entry_count,
1780  const uint32_t h,
1781  const int64_t* key,
1782  const uint32_t key_count,
1783  const uint32_t key_width) {
1784  switch (key_width) {
1785  case 4:
1786  return get_matching_group_value_columnar_slot(groups_buffer,
1787  entry_count,
1788  h,
1789  reinterpret_cast<const int32_t*>(key),
1790  key_count);
1791  case 8:
1792  return get_matching_group_value_columnar_slot(
1793  groups_buffer, entry_count, h, key, key_count);
1794  default:
1795  return -1;
1796  }
1797  return -1;
1798 }
1799 
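In the columnar variants each key column occupies a contiguous stripe of entry_count values, so the i-th key component of entry h lives at groups_buffer[i * entry_count + h]. A small layout illustration (values are made up):

// Columnar key layout for entry_count = 4 and key_count = 2:
//
//   groups_buffer[0..3]  -> key column 0 for entries 0..3
//   groups_buffer[4..7]  -> key column 1 for entries 0..3
//
// get_matching_group_value_columnar_slot with h = 2 therefore compares
// groups_buffer[2] and groups_buffer[6] against key[0] and key[1].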
1800 extern "C" RUNTIME_EXPORT ALWAYS_INLINE int64_t* get_matching_group_value_columnar(
1801  int64_t* groups_buffer,
1802  const uint32_t h,
1803  const int64_t* key,
1804  const uint32_t key_qw_count,
1805  const size_t entry_count) {
1806  auto off = h;
1807  if (groups_buffer[off] == EMPTY_KEY_64) {
1808  for (size_t i = 0; i < key_qw_count; ++i) {
1809  groups_buffer[off] = key[i];
1810  off += entry_count;
1811  }
1812  return &groups_buffer[off];
1813  }
1814  off = h;
1815  for (size_t i = 0; i < key_qw_count; ++i) {
1816  if (groups_buffer[off] != key[i]) {
1817  return nullptr;
1818  }
1819  off += entry_count;
1820  }
1821  return &groups_buffer[off];
1822 }
1823 
1824 /*
1825  * For a particular hashed_index, returns the row-wise offset
1826  * to the first matching agg column in memory.
1827  * It also checks the corresponding group columns and initializes all of the
1828  * key columns if the entry is still empty (it is assumed all group columns are
1829  * 64-bit wide).
1830  *
1831  * Memory layout:
1832  *
1833  * | prepended group columns (64-bit each) | agg columns |
1834  */
1835 extern "C" RUNTIME_EXPORT ALWAYS_INLINE int64_t* get_matching_group_value_perfect_hash(
1836  int64_t* groups_buffer,
1837  const uint32_t hashed_index,
1838  const int64_t* key,
1839  const uint32_t key_count,
1840  const uint32_t row_size_quad) {
1841  uint32_t off = hashed_index * row_size_quad;
1842  if (groups_buffer[off] == EMPTY_KEY_64) {
1843  for (uint32_t i = 0; i < key_count; ++i) {
1844  groups_buffer[off + i] = key[i];
1845  }
1846  }
1847  return groups_buffer + off + key_count;
1848 }
1849 
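With perfect hashing the entry index is the hashed_index itself, so no probing or key comparison is needed beyond initializing the prepended group columns on first use; the returned pointer skips past those key_count columns. A layout illustration (numbers chosen only for the example):

// Row-wise perfect-hash layout for row_size_quad = 6 and key_count = 2:
//
//   entry 3 occupies groups_buffer[18 .. 23]
//     groups_buffer[18..19] -> group key columns (written when the entry is first seen)
//     groups_buffer[20..23] -> aggregate columns
//
// get_matching_group_value_perfect_hash(..., hashed_index = 3, key_count = 2,
//                                       row_size_quad = 6) returns groups_buffer + 20.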
1856 extern "C" RUNTIME_EXPORT ALWAYS_INLINE int64_t*
1857 get_matching_group_value_perfect_hash_keyless(int64_t* groups_buffer,
1858  const uint32_t hashed_index,
1859  const uint32_t row_size_quad) {
1860  return groups_buffer + row_size_quad * hashed_index;
1861 }
1862 
1863 /*
1864  * For a particular hashed_index, find and initialize (if necessary) all the group
1865  * columns corresponding to a key. It is assumed that all group columns are 64-bit wide.
1866  */
1867 extern "C" RUNTIME_EXPORT ALWAYS_INLINE void
1868 set_matching_group_value_perfect_hash_columnar(int64_t* groups_buffer,
1869  const uint32_t hashed_index,
1870  const int64_t* key,
1871  const uint32_t key_count,
1872  const uint32_t entry_count) {
1873  if (groups_buffer[hashed_index] == EMPTY_KEY_64) {
1874  for (uint32_t i = 0; i < key_count; i++) {
1875  groups_buffer[i * entry_count + hashed_index] = key[i];
1876  }
1877  }
1878 }
1879 
1880 #include "GeoOpsRuntime.cpp"
1881 #include "GroupByRuntime.cpp"
1883 
1884 extern "C" RUNTIME_EXPORT ALWAYS_INLINE int64_t* get_group_value_fast_keyless(
1885  int64_t* groups_buffer,
1886  const int64_t key,
1887  const int64_t min_key,
1888  const int64_t /* bucket */,
1889  const uint32_t row_size_quad) {
1890  return groups_buffer + row_size_quad * (key - min_key);
1891 }
1892 
1893 extern "C" RUNTIME_EXPORT ALWAYS_INLINE int64_t* get_group_value_fast_keyless_semiprivate(
1894  int64_t* groups_buffer,
1895  const int64_t key,
1896  const int64_t min_key,
1897  const int64_t /* bucket */,
1898  const uint32_t row_size_quad,
1899  const uint8_t thread_warp_idx,
1900  const uint8_t warp_size) {
1901  return groups_buffer + row_size_quad * (warp_size * (key - min_key) + thread_warp_idx);
1902 }
1903 
1904 extern "C" RUNTIME_EXPORT ALWAYS_INLINE int8_t* extract_str_ptr(
1905  const uint64_t str_and_len) {
1906  return reinterpret_cast<int8_t*>(str_and_len & 0xffffffffffff);
1907 }
1908 
1909 extern "C" RUNTIME_EXPORT ALWAYS_INLINE int32_t
1910 extract_str_len(const uint64_t str_and_len) {
1911  return static_cast<int64_t>(str_and_len) >> 48;
1912 }
1913 
1914 extern "C" RUNTIME_EXPORT NEVER_INLINE int8_t* extract_str_ptr_noinline(
1915  const uint64_t str_and_len) {
1916  return extract_str_ptr(str_and_len);
1917 }
1918 
1919 extern "C" RUNTIME_EXPORT NEVER_INLINE int32_t
1920 extract_str_len_noinline(const uint64_t str_and_len) {
1921  return extract_str_len(str_and_len);
1922 }
1923 
1924 extern "C" RUNTIME_EXPORT ALWAYS_INLINE uint64_t string_pack(const int8_t* ptr,
1925  const int32_t len) {
1926  return (reinterpret_cast<const uint64_t>(ptr) & 0xffffffffffff) |
1927  (static_cast<const uint64_t>(len) << 48);
1928 }
1929 
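string_pack and the extract_str_* helpers implement a tagged 64-bit value: the low 48 bits hold the string pointer and the high 16 bits hold the length, which extract_str_len recovers with an arithmetic right shift. A minimal round-trip sketch (it assumes the address fits in 48 bits, as the packing itself does):

#include <cassert>
#include <cstdint>

void packed_string_roundtrip_example() {
  const char text[] = "hello";
  const int8_t* ptr = reinterpret_cast<const int8_t*>(text);
  const int32_t len = 5;

  const uint64_t packed = string_pack(ptr, len);  // low 48 bits: ptr, high 16 bits: len
  assert(extract_str_ptr(packed) == ptr);
  assert(extract_str_len(packed) == len);
}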
1930 #ifdef __clang__
1931 #include "../Utils/StringLike.cpp"
1932 #endif
1933 
1934 #ifndef __CUDACC__
1935 #include "TopKRuntime.cpp"
1936 #endif
1937 
1938 extern "C" RUNTIME_EXPORT ALWAYS_INLINE DEVICE int32_t
1939 char_length(const char* str, const int32_t str_len) {
1940  return str_len;
1941 }
1942 
1943 extern "C" RUNTIME_EXPORT ALWAYS_INLINE DEVICE int32_t
1944 char_length_nullable(const char* str, const int32_t str_len, const int32_t int_null) {
1945  if (!str) {
1946  return int_null;
1947  }
1948  return str_len;
1949 }
1950 
1951 extern "C" RUNTIME_EXPORT ALWAYS_INLINE DEVICE int32_t
1952 key_for_string_encoded(const int32_t str_id) {
1953  return str_id;
1954 }
1955 
1956 extern "C" ALWAYS_INLINE DEVICE int32_t
1957 map_string_dict_id(const int32_t string_id,
1958  const int64_t translation_map_handle,
1959  const int32_t min_source_id) {
1960  const int32_t* translation_map =
1961  reinterpret_cast<const int32_t*>(translation_map_handle);
1962  return translation_map[string_id - min_source_id];
1963 }
1964 
1965 extern "C" RUNTIME_EXPORT ALWAYS_INLINE DEVICE bool sample_ratio(
1966  const double proportion,
1967  const int64_t row_offset) {
1968  const int64_t threshold = 4294967296 * proportion;
1969  return (row_offset * 2654435761) % 4294967296 < threshold;
1970 }
1971 
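sample_ratio hashes the row offset with the Knuth multiplicative constant 2654435761, reduces the product modulo 2^32, and keeps the row when the result falls below proportion * 2^32, so roughly the requested fraction of rows passes. Hand-worked values for illustration:

// sample_ratio arithmetic for proportion = 0.5 (threshold = 2147483648):
//   row_offset = 1: (1 * 2654435761) % 4294967296 = 2654435761  -> not sampled
//   row_offset = 2: (2 * 2654435761) % 4294967296 = 1013904226  -> sampled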
1972 extern "C" RUNTIME_EXPORT ALWAYS_INLINE DEVICE int32_t
1973 width_bucket(const double target_value,
1974  const double lower_bound,
1975  const double upper_bound,
1976  const double scale_factor,
1977  const int32_t partition_count) {
1978  if (target_value < lower_bound) {
1979  return 0;
1980  } else if (target_value >= upper_bound) {
1981  return partition_count + 1;
1982  }
1983  return ((target_value - lower_bound) * scale_factor) + 1;
1984 }
1985 
1986 extern "C" RUNTIME_EXPORT ALWAYS_INLINE DEVICE int32_t
1987 width_bucket_reversed(const double target_value,
1988  const double lower_bound,
1989  const double upper_bound,
1990  const double scale_factor,
1991  const int32_t partition_count) {
1992  if (target_value > lower_bound) {
1993  return 0;
1994  } else if (target_value <= upper_bound) {
1995  return partition_count + 1;
1996  }
1997  return ((lower_bound - target_value) * scale_factor) + 1;
1998 }
1999 
2000 extern "C" RUNTIME_EXPORT ALWAYS_INLINE int32_t
2001 width_bucket_nullable(const double target_value,
2002  const double lower_bound,
2003  const double upper_bound,
2004  const double scale_factor,
2005  const int32_t partition_count,
2006  const double null_val) {
2007  if (target_value == null_val) {
2008  return INT32_MIN;
2009  }
2010  return width_bucket(
2011  target_value, lower_bound, upper_bound, scale_factor, partition_count);
2012 }
2013 
2014 extern "C" RUNTIME_EXPORT ALWAYS_INLINE int32_t
2015 width_bucket_reversed_nullable(const double target_value,
2016  const double lower_bound,
2017  const double upper_bound,
2018  const double scale_factor,
2019  const int32_t partition_count,
2020  const double null_val) {
2021  if (target_value == null_val) {
2022  return INT32_MIN;
2023  }
2024  return width_bucket_reversed(
2025  target_value, lower_bound, upper_bound, scale_factor, partition_count);
2026 }
2027 
2028 // A version of width_bucket without the out-of-bounds check; it can be called
2029 // when the input target_value expression is guaranteed to reside in the valid range
2030 // (which also lets us skip the null check)
2031 extern "C" RUNTIME_EXPORT ALWAYS_INLINE DEVICE int32_t
2032 width_bucket_no_oob_check(const double target_value,
2033  const double lower_bound,
2034  const double scale_factor) {
2035  return ((target_value - lower_bound) * scale_factor) + 1;
2036 }
2037 
2038 extern "C" RUNTIME_EXPORT ALWAYS_INLINE DEVICE int32_t
2039 width_bucket_reversed_no_oob_check(const double target_value,
2040  const double lower_bound,
2041  const double scale_factor) {
2042  return ((lower_bound - target_value) * scale_factor) + 1;
2043 }
2044 
2045 extern "C" RUNTIME_EXPORT ALWAYS_INLINE DEVICE int32_t
2046 width_bucket_expr(const double target_value,
2047  const bool reversed,
2048  const double lower_bound,
2049  const double upper_bound,
2050  const int32_t partition_count) {
2051  if (reversed) {
2052  return width_bucket_reversed(target_value,
2053  lower_bound,
2054  upper_bound,
2055  partition_count / (lower_bound - upper_bound),
2056  partition_count);
2057  }
2058  return width_bucket(target_value,
2059  lower_bound,
2060  upper_bound,
2061  partition_count / (upper_bound - lower_bound),
2062  partition_count);
2063 }
2064 
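width_bucket_expr derives the scale factor as partition_count / (upper_bound - lower_bound) (with the bounds swapped in the denominator for the reversed case) and then defers to width_bucket, which returns 0 below the range and partition_count + 1 at or above it. A hand-worked example:

// width_bucket_expr with lower_bound = 0, upper_bound = 100, partition_count = 10
// (each bucket spans 10 units, scale_factor = 0.1):
//   target_value =  37  -> (37 - 0) * 0.1 + 1 = 4.7 -> bucket 4   (covers [30, 40))
//   target_value =  -5  -> below lower_bound         -> bucket 0
//   target_value = 100  -> at/above upper_bound      -> bucket 11  (partition_count + 1)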
2065 extern "C" RUNTIME_EXPORT ALWAYS_INLINE DEVICE int32_t
2066 width_bucket_expr_nullable(const double target_value,
2067  const bool reversed,
2068  const double lower_bound,
2069  const double upper_bound,
2070  const int32_t partition_count,
2071  const double null_val) {
2072  if (target_value == null_val) {
2073  return INT32_MIN;
2074  }
2075  return width_bucket_expr(
2076  target_value, reversed, lower_bound, upper_bound, partition_count);
2077 }
2078 
2079 extern "C" RUNTIME_EXPORT ALWAYS_INLINE DEVICE int32_t
2080 width_bucket_expr_no_oob_check(const double target_value,
2081  const bool reversed,
2082  const double lower_bound,
2083  const double upper_bound,
2084  const int32_t partition_count) {
2085  if (reversed) {
2086  return width_bucket_reversed_no_oob_check(
2087  target_value, lower_bound, partition_count / (lower_bound - upper_bound));
2088  }
2089  return width_bucket_no_oob_check(
2090  target_value, lower_bound, partition_count / (upper_bound - lower_bound));
2091 }
2092 
2093 extern "C" RUNTIME_EXPORT ALWAYS_INLINE int64_t
2094 row_number_window_func(const int64_t output_buff, const int64_t pos) {
2095  return reinterpret_cast<const int64_t*>(output_buff)[pos];
2096 }
2097 
2098 extern "C" RUNTIME_EXPORT ALWAYS_INLINE double percent_window_func(
2099  const int64_t output_buff,
2100  const int64_t pos) {
2101  return reinterpret_cast<const double*>(output_buff)[pos];
2102 }
2103 
2104 extern "C" RUNTIME_EXPORT ALWAYS_INLINE double load_double(const int64_t* agg) {
2105  return *reinterpret_cast<const double*>(may_alias_ptr(agg));
2106 }
2107 
2108 extern "C" RUNTIME_EXPORT ALWAYS_INLINE float load_float(const int32_t* agg) {
2109  return *reinterpret_cast<const float*>(may_alias_ptr(agg));
2110 }
2111 
2112 extern "C" RUNTIME_EXPORT ALWAYS_INLINE double load_avg_int(const int64_t* sum,
2113  const int64_t* count,
2114  const double null_val) {
2115  return *count != 0 ? static_cast<double>(*sum) / *count : null_val;
2116 }
2117 
2118 extern "C" RUNTIME_EXPORT ALWAYS_INLINE double load_avg_decimal(const int64_t* sum,
2119  const int64_t* count,
2120  const double null_val,
2121  const uint32_t scale) {
2122  return *count != 0 ? (static_cast<double>(*sum) / pow(10, scale)) / *count : null_val;
2123 }
2124 
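load_avg_decimal first rescales the accumulated integer sum by 10^scale to recover the decimal value and then divides by the count; an empty group (count = 0) yields null_val. For example:

// load_avg_decimal with scale = 2 (illustrative values):
//   sum = 12345, count = 10 -> (12345 / 10^2) / 10 = 123.45 / 10 = 12.345
//   count = 0               -> null_val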
2125 extern "C" RUNTIME_EXPORT ALWAYS_INLINE double load_avg_double(const int64_t* agg,
2126  const int64_t* count,
2127  const double null_val) {
2128  return *count != 0 ? *reinterpret_cast<const double*>(may_alias_ptr(agg)) / *count
2129  : null_val;
2130 }
2131 
2132 extern "C" RUNTIME_EXPORT ALWAYS_INLINE double load_avg_float(const int32_t* agg,
2133  const int32_t* count,
2134  const double null_val) {
2135  return *count != 0 ? *reinterpret_cast<const float*>(may_alias_ptr(agg)) / *count
2136  : null_val;
2137 }
2138 
2139 extern "C" RUNTIME_EXPORT NEVER_INLINE void linear_probabilistic_count(
2140  uint8_t* bitmap,
2141  const uint32_t bitmap_bytes,
2142  const uint8_t* key_bytes,
2143  const uint32_t key_len) {
2144  const uint32_t bit_pos = MurmurHash3(key_bytes, key_len, 0) % (bitmap_bytes * 8);
2145  const uint32_t word_idx = bit_pos / 32;
2146  const uint32_t bit_idx = bit_pos % 32;
2147  reinterpret_cast<uint32_t*>(bitmap)[word_idx] |= 1 << bit_idx;
2148 }
2149 
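linear_probabilistic_count only sets bits; turning the bitmap into a cardinality estimate happens elsewhere. The textbook linear-counting estimator, shown below as a hedged sketch of the read side (not necessarily the exact formula this engine applies), is n ≈ -m * ln(V), where m is the bitmap size in bits and V is the fraction of bits still zero:

#include <bitset>
#include <cmath>
#include <cstdint>

// Standard linear-counting read-out for a bitmap filled by linear_probabilistic_count.
double linear_counting_estimate(const uint8_t* bitmap, const uint32_t bitmap_bytes) {
  const double total_bits = static_cast<double>(bitmap_bytes) * 8;
  uint64_t zero_bits = 0;
  for (uint32_t i = 0; i < bitmap_bytes; ++i) {
    zero_bits += 8 - std::bitset<8>(bitmap[i]).count();
  }
  if (zero_bits == 0) {
    return total_bits;  // bitmap saturated; the estimate degenerates
  }
  const double v = static_cast<double>(zero_bits) / total_bits;
  return -total_bits * std::log(v);
}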
2150 extern "C" RUNTIME_EXPORT NEVER_INLINE void query_stub_hoisted_literals(
2151  const int8_t** col_buffers,
2152  const int8_t* literals,
2153  const int64_t* num_rows,
2154  const uint64_t* frag_row_offsets,
2155  const int32_t* max_matched,
2156  const int64_t* init_agg_value,
2157  int64_t** out,
2158  uint32_t frag_idx,
2159  const int64_t* join_hash_tables,
2160  int32_t* error_code,
2161  int32_t* total_matched) {
2162 #ifndef _WIN32
2163  assert(col_buffers || literals || num_rows || frag_row_offsets || max_matched ||
2164  init_agg_value || out || frag_idx || error_code || join_hash_tables ||
2165  total_matched);
2166 #endif
2167 }
2168 
2169 extern "C" RUNTIME_EXPORT void multifrag_query_hoisted_literals(
2170  const int8_t*** col_buffers,
2171  const uint64_t* num_fragments,
2172  const int8_t* literals,
2173  const int64_t* num_rows,
2174  const uint64_t* frag_row_offsets,
2175  const int32_t* max_matched,
2176  int32_t* total_matched,
2177  const int64_t* init_agg_value,
2178  int64_t** out,
2179  int32_t* error_code,
2180  const uint32_t* num_tables_ptr,
2181  const int64_t* join_hash_tables) {
2182  for (uint32_t i = 0; i < *num_fragments; ++i) {
2183  query_stub_hoisted_literals(col_buffers ? col_buffers[i] : nullptr,
2184  literals,
2185  &num_rows[i * (*num_tables_ptr)],
2186  &frag_row_offsets[i * (*num_tables_ptr)],
2187  max_matched,
2188  init_agg_value,
2189  out,
2190  i,
2191  join_hash_tables,
2192  total_matched,
2193  error_code);
2194  }
2195 }
2196 
2197 extern "C" RUNTIME_EXPORT NEVER_INLINE void query_stub(const int8_t** col_buffers,
2198  const int64_t* num_rows,
2199  const uint64_t* frag_row_offsets,
2200  const int32_t* max_matched,
2201  const int64_t* init_agg_value,
2202  int64_t** out,
2203  uint32_t frag_idx,
2204  const int64_t* join_hash_tables,
2205  int32_t* error_code,
2206  int32_t* total_matched) {
2207 #ifndef _WIN32
2208  assert(col_buffers || num_rows || frag_row_offsets || max_matched || init_agg_value ||
2209  out || frag_idx || error_code || join_hash_tables || total_matched);
2210 #endif
2211 }
2212 
2213 extern "C" RUNTIME_EXPORT void multifrag_query(const int8_t*** col_buffers,
2214  const uint64_t* num_fragments,
2215  const int64_t* num_rows,
2216  const uint64_t* frag_row_offsets,
2217  const int32_t* max_matched,
2218  int32_t* total_matched,
2219  const int64_t* init_agg_value,
2220  int64_t** out,
2221  int32_t* error_code,
2222  const uint32_t* num_tables_ptr,
2223  const int64_t* join_hash_tables) {
2224  for (uint32_t i = 0; i < *num_fragments; ++i) {
2225  query_stub(col_buffers ? col_buffers[i] : nullptr,
2226  &num_rows[i * (*num_tables_ptr)],
2227  &frag_row_offsets[i * (*num_tables_ptr)],
2228  max_matched,
2229  init_agg_value,
2230  out,
2231  i,
2232  join_hash_tables,
2233  total_matched,
2234  error_code);
2235  }
2236 }
2237 
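multifrag_query and multifrag_query_hoisted_literals drive the single-fragment stubs once per fragment. num_rows and frag_row_offsets hold one entry per table per fragment, which is why the loop indexes them with i * (*num_tables_ptr). A layout illustration:

// num_rows layout for 2 fragments over a 3-table join (*num_tables_ptr == 3):
//   num_rows[0..2] -> per-table row counts passed to fragment 0
//   num_rows[3..5] -> per-table row counts passed to fragment 1
// frag_row_offsets uses the same stride.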
2238 extern "C" RUNTIME_EXPORT bool check_interrupt() {
2239  if (check_interrupt_init(static_cast<unsigned>(INT_CHECK))) {
2240  return true;
2241  }
2242  return false;
2243 }
2244 
2245 extern "C" RUNTIME_EXPORT bool check_interrupt_init(unsigned command) {
2246  static std::atomic_bool runtime_interrupt_flag{false};
2247 
2248  if (command == static_cast<unsigned>(INT_CHECK)) {
2249  if (runtime_interrupt_flag.load()) {
2250  return true;
2251  }
2252  return false;
2253  }
2254  if (command == static_cast<unsigned>(INT_ABORT)) {
2255  runtime_interrupt_flag.store(true);
2256  return false;
2257  }
2258  if (command == static_cast<unsigned>(INT_RESET)) {
2259  runtime_interrupt_flag.store(false);
2260  return false;
2261  }
2262  return false;
2263 }
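check_interrupt_init multiplexes three commands over one static atomic flag: INT_CHECK reads it, INT_ABORT sets it, and INT_RESET clears it, so a control thread can stop a CPU kernel loop that periodically calls check_interrupt(). A hedged usage sketch (the surrounding functions are hypothetical):

// Control side: request and clear an interrupt.
void request_interrupt() {
  check_interrupt_init(static_cast<unsigned>(INT_ABORT));
}
void clear_interrupt() {
  check_interrupt_init(static_cast<unsigned>(INT_RESET));
}

// Worker side: poll the flag every so often inside a row-processing loop.
bool process_rows_example(const int64_t num_rows) {
  for (int64_t i = 0; i < num_rows; ++i) {
    if ((i & 0xffff) == 0 && check_interrupt()) {
      return false;  // query was interrupted
    }
    // ... per-row work ...
  }
  return true;
}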