OmniSciDB  5ade3759e0
RuntimeFunctions.cpp
1 /*
2  * Copyright 2017 MapD Technologies, Inc.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #ifdef __CUDACC__
18 #error This code is not intended to be compiled with a CUDA C++ compiler
19 #endif // __CUDACC__
20 
21 #include "RuntimeFunctions.h"
22 #include "../Shared/funcannotations.h"
23 #include "BufferCompaction.h"
24 #include "HyperLogLogRank.h"
25 #include "MurmurHash.h"
26 #include "TypePunning.h"
27 
28 #include <algorithm>
29 #include <chrono>
30 #include <cmath>
31 #include <cstring>
32 #include <thread>
33 #include <tuple>
34 
35 // decoder implementations
36 
37 #include "DecodersImpl.h"
38 
39 // arithmetic operator implementations
40 
41 #define DEF_ARITH_NULLABLE(type, null_type, opname, opsym) \
42  extern "C" ALWAYS_INLINE type opname##_##type##_nullable( \
43  const type lhs, const type rhs, const null_type null_val) { \
44  if (lhs != null_val && rhs != null_val) { \
45  return lhs opsym rhs; \
46  } \
47  return null_val; \
48  }
49 
50 #define DEF_ARITH_NULLABLE_LHS(type, null_type, opname, opsym) \
51  extern "C" ALWAYS_INLINE type opname##_##type##_nullable_lhs( \
52  const type lhs, const type rhs, const null_type null_val) { \
53  if (lhs != null_val) { \
54  return lhs opsym rhs; \
55  } \
56  return null_val; \
57  }
58 
59 #define DEF_ARITH_NULLABLE_RHS(type, null_type, opname, opsym) \
60  extern "C" ALWAYS_INLINE type opname##_##type##_nullable_rhs( \
61  const type lhs, const type rhs, const null_type null_val) { \
62  if (rhs != null_val) { \
63  return lhs opsym rhs; \
64  } \
65  return null_val; \
66  }
67 
68 #define DEF_CMP_NULLABLE(type, null_type, opname, opsym) \
69  extern "C" ALWAYS_INLINE int8_t opname##_##type##_nullable( \
70  const type lhs, \
71  const type rhs, \
72  const null_type null_val, \
73  const int8_t null_bool_val) { \
74  if (lhs != null_val && rhs != null_val) { \
75  return lhs opsym rhs; \
76  } \
77  return null_bool_val; \
78  }
79 
80 #define DEF_CMP_NULLABLE_LHS(type, null_type, opname, opsym) \
81  extern "C" ALWAYS_INLINE int8_t opname##_##type##_nullable_lhs( \
82  const type lhs, \
83  const type rhs, \
84  const null_type null_val, \
85  const int8_t null_bool_val) { \
86  if (lhs != null_val) { \
87  return lhs opsym rhs; \
88  } \
89  return null_bool_val; \
90  }
91 
92 #define DEF_CMP_NULLABLE_RHS(type, null_type, opname, opsym) \
93  extern "C" ALWAYS_INLINE int8_t opname##_##type##_nullable_rhs( \
94  const type lhs, \
95  const type rhs, \
96  const null_type null_val, \
97  const int8_t null_bool_val) { \
98  if (rhs != null_val) { \
99  return lhs opsym rhs; \
100  } \
101  return null_bool_val; \
102  }
103 
104 #define DEF_SAFE_DIV_NULLABLE(type, null_type, opname) \
105  extern "C" ALWAYS_INLINE type safe_div_##type( \
106  const type lhs, const type rhs, const null_type null_val) { \
107  if (lhs != null_val && rhs != null_val && rhs != 0) { \
108  return lhs / rhs; \
109  } \
110  return null_val; \
111  }
112 
113 #define DEF_BINARY_NULLABLE_ALL_OPS(type, null_type) \
114  DEF_ARITH_NULLABLE(type, null_type, add, +) \
115  DEF_ARITH_NULLABLE(type, null_type, sub, -) \
116  DEF_ARITH_NULLABLE(type, null_type, mul, *) \
117  DEF_ARITH_NULLABLE(type, null_type, div, /) \
118  DEF_SAFE_DIV_NULLABLE(type, null_type, safe_div) \
119  DEF_ARITH_NULLABLE_LHS(type, null_type, add, +) \
120  DEF_ARITH_NULLABLE_LHS(type, null_type, sub, -) \
121  DEF_ARITH_NULLABLE_LHS(type, null_type, mul, *) \
122  DEF_ARITH_NULLABLE_LHS(type, null_type, div, /) \
123  DEF_ARITH_NULLABLE_RHS(type, null_type, add, +) \
124  DEF_ARITH_NULLABLE_RHS(type, null_type, sub, -) \
125  DEF_ARITH_NULLABLE_RHS(type, null_type, mul, *) \
126  DEF_ARITH_NULLABLE_RHS(type, null_type, div, /) \
127  DEF_CMP_NULLABLE(type, null_type, eq, ==) \
128  DEF_CMP_NULLABLE(type, null_type, ne, !=) \
129  DEF_CMP_NULLABLE(type, null_type, lt, <) \
130  DEF_CMP_NULLABLE(type, null_type, gt, >) \
131  DEF_CMP_NULLABLE(type, null_type, le, <=) \
132  DEF_CMP_NULLABLE(type, null_type, ge, >=) \
133  DEF_CMP_NULLABLE_LHS(type, null_type, eq, ==) \
134  DEF_CMP_NULLABLE_LHS(type, null_type, ne, !=) \
135  DEF_CMP_NULLABLE_LHS(type, null_type, lt, <) \
136  DEF_CMP_NULLABLE_LHS(type, null_type, gt, >) \
137  DEF_CMP_NULLABLE_LHS(type, null_type, le, <=) \
138  DEF_CMP_NULLABLE_LHS(type, null_type, ge, >=) \
139  DEF_CMP_NULLABLE_RHS(type, null_type, eq, ==) \
140  DEF_CMP_NULLABLE_RHS(type, null_type, ne, !=) \
141  DEF_CMP_NULLABLE_RHS(type, null_type, lt, <) \
142  DEF_CMP_NULLABLE_RHS(type, null_type, gt, >) \
143  DEF_CMP_NULLABLE_RHS(type, null_type, le, <=) \
144  DEF_CMP_NULLABLE_RHS(type, null_type, ge, >=)
145 
146 DEF_BINARY_NULLABLE_ALL_OPS(int16_t, int64_t)
147 DEF_BINARY_NULLABLE_ALL_OPS(int32_t, int64_t)
148 DEF_BINARY_NULLABLE_ALL_OPS(int64_t, int64_t)
149 DEF_BINARY_NULLABLE_ALL_OPS(float, float)
150 DEF_BINARY_NULLABLE_ALL_OPS(double, double)
151 DEF_CMP_NULLABLE(int8_t, int64_t, eq, ==)
152 DEF_CMP_NULLABLE(int8_t, int64_t, ne, !=)
153 DEF_CMP_NULLABLE_LHS(int8_t, int64_t, eq, ==)
154 DEF_CMP_NULLABLE_LHS(int8_t, int64_t, ne, !=)
155 DEF_CMP_NULLABLE_RHS(int8_t, int64_t, eq, ==)
156 DEF_CMP_NULLABLE_RHS(int8_t, int64_t, ne, !=)
157 DEF_ARITH_NULLABLE(int8_t, int64_t, mod, %)
158 DEF_ARITH_NULLABLE(int16_t, int64_t, mod, %)
159 DEF_ARITH_NULLABLE(int32_t, int64_t, mod, %)
160 DEF_ARITH_NULLABLE(int64_t, int64_t, mod, %)
161 DEF_ARITH_NULLABLE_LHS(int8_t, int64_t, mod, %)
162 DEF_ARITH_NULLABLE_LHS(int16_t, int64_t, mod, %)
163 DEF_ARITH_NULLABLE_LHS(int32_t, int64_t, mod, %)
164 DEF_ARITH_NULLABLE_LHS(int64_t, int64_t, mod, %)
165 DEF_ARITH_NULLABLE_RHS(int8_t, int64_t, mod, %)
166 DEF_ARITH_NULLABLE_RHS(int16_t, int64_t, mod, %)
167 DEF_ARITH_NULLABLE_RHS(int32_t, int64_t, mod, %)
168 DEF_ARITH_NULLABLE_RHS(int64_t, int64_t, mod, %)
169 
170 #undef DEF_BINARY_NULLABLE_ALL_OPS
171 #undef DEF_SAFE_DIV_NULLABLE
172 #undef DEF_CMP_NULLABLE_RHS
173 #undef DEF_CMP_NULLABLE_LHS
174 #undef DEF_CMP_NULLABLE
175 #undef DEF_ARITH_NULLABLE_RHS
176 #undef DEF_ARITH_NULLABLE_LHS
177 #undef DEF_ARITH_NULLABLE
178 
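These macro-generated helpers encode SQL NULL as an in-band sentinel value: if either operand equals the sentinel the result is the sentinel, otherwise the plain C++ operator is applied. A minimal standalone sketch of that convention for the 64-bit case (the _sketch name, the driver and the INT64_MIN sentinel are illustrative and not part of this file):

#include <cstdint>
#include <cstdio>

// Hand-expanded equivalent of DEF_ARITH_NULLABLE(int64_t, int64_t, add, +):
// any operand equal to the null sentinel forces a null result.
int64_t add_int64_t_nullable_sketch(const int64_t lhs,
                                    const int64_t rhs,
                                    const int64_t null_val) {
  if (lhs != null_val && rhs != null_val) {
    return lhs + rhs;
  }
  return null_val;
}

int main() {
  const int64_t kNull = INT64_MIN;  // illustrative sentinel choice
  printf("%lld\n", (long long)add_int64_t_nullable_sketch(2, 3, kNull));      // 5
  printf("%lld\n", (long long)add_int64_t_nullable_sketch(2, kNull, kNull));  // null sentinel
  return 0;
}

The _lhs and _rhs variants apply the same rule when only one side can be null, saving one comparison in the generated code.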
179 extern "C" ALWAYS_INLINE int64_t scale_decimal_up(const int64_t operand,
180  const uint64_t scale,
181  const int64_t operand_null_val,
182  const int64_t result_null_val) {
183  return operand != operand_null_val ? operand * scale : result_null_val;
184 }
185 
186 extern "C" ALWAYS_INLINE int64_t scale_decimal_down_nullable(const int64_t operand,
187  const int64_t scale,
188  const int64_t null_val) {
189  // rounded scale down of a decimal
190  if (operand == null_val) {
191  return null_val;
192  }
193 
194  int64_t tmp = scale >> 1;
195  tmp = operand >= 0 ? operand + tmp : operand - tmp;
196  return tmp / scale;
197 }
198 
199 extern "C" ALWAYS_INLINE int64_t scale_decimal_down_not_nullable(const int64_t operand,
200  const int64_t scale,
201  const int64_t null_val) {
202  int64_t tmp = scale >> 1;
203  tmp = operand >= 0 ? operand + tmp : operand - tmp;
204  return tmp / scale;
205 }
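Both scale_decimal_down variants round half away from zero by adding (for non-negative operands) or subtracting (for negative ones) half of the scale before the truncating integer division. A hedged worked example, assuming a scale factor of 100, i.e. two stored decimal digits:

#include <cstdint>
#include <cstdio>

// Same rounding recipe as scale_decimal_down_not_nullable, restated locally.
int64_t scale_down_sketch(const int64_t operand, const int64_t scale) {
  int64_t tmp = scale >> 1;                        // half the scale, e.g. 50 for 100
  tmp = operand >= 0 ? operand + tmp : operand - tmp;
  return tmp / scale;
}

int main() {
  printf("%lld\n", (long long)scale_down_sketch(1249, 100));   // 12  (12.49 rounds down)
  printf("%lld\n", (long long)scale_down_sketch(1250, 100));   // 13  (12.50 rounds up)
  printf("%lld\n", (long long)scale_down_sketch(-1250, 100));  // -13 (away from zero)
  return 0;
}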
206 
207 #define DEF_UMINUS_NULLABLE(type, null_type) \
208  extern "C" ALWAYS_INLINE type uminus_##type##_nullable(const type operand, \
209  const null_type null_val) { \
210  return operand == null_val ? null_val : -operand; \
211  }
212 
213 DEF_UMINUS_NULLABLE(int16_t, int16_t)
214 DEF_UMINUS_NULLABLE(int32_t, int32_t)
215 DEF_UMINUS_NULLABLE(int64_t, int64_t)
216 DEF_UMINUS_NULLABLE(float, float)
217 DEF_UMINUS_NULLABLE(double, double)
218 
219 #undef DEF_UMINUS_NULLABLE
220 
221 #define DEF_CAST_NULLABLE(from_type, to_type) \
222  extern "C" ALWAYS_INLINE to_type cast_##from_type##_to_##to_type##_nullable( \
223  const from_type operand, \
224  const from_type from_null_val, \
225  const to_type to_null_val) { \
226  return operand == from_null_val ? to_null_val : operand; \
227  }
228 
229 #define DEF_CAST_NULLABLE_BIDIR(type1, type2) \
230  DEF_CAST_NULLABLE(type1, type2) \
231  DEF_CAST_NULLABLE(type2, type1)
232 
233 DEF_CAST_NULLABLE_BIDIR(int8_t, int16_t)
234 DEF_CAST_NULLABLE_BIDIR(int8_t, int32_t)
235 DEF_CAST_NULLABLE_BIDIR(int8_t, int64_t)
236 DEF_CAST_NULLABLE_BIDIR(int16_t, int32_t)
237 DEF_CAST_NULLABLE_BIDIR(int16_t, int64_t)
238 DEF_CAST_NULLABLE_BIDIR(int32_t, int64_t)
239 DEF_CAST_NULLABLE_BIDIR(float, double)
240 DEF_CAST_NULLABLE_BIDIR(float, int8_t)
241 DEF_CAST_NULLABLE_BIDIR(float, int16_t)
242 DEF_CAST_NULLABLE_BIDIR(float, int32_t)
243 DEF_CAST_NULLABLE_BIDIR(float, int64_t)
244 DEF_CAST_NULLABLE_BIDIR(double, int8_t)
245 DEF_CAST_NULLABLE_BIDIR(double, int16_t)
246 DEF_CAST_NULLABLE_BIDIR(double, int32_t)
247 DEF_CAST_NULLABLE_BIDIR(double, int64_t)
248 DEF_CAST_NULLABLE(uint8_t, int32_t)
249 DEF_CAST_NULLABLE(uint16_t, int32_t)
250 
251 #undef DEF_CAST_NULLABLE_BIDIR
252 #undef DEF_CAST_NULLABLE
253 
254 extern "C" ALWAYS_INLINE int8_t logical_not(const int8_t operand, const int8_t null_val) {
255  return operand == null_val ? operand : (operand ? 0 : 1);
256 }
257 
258 extern "C" ALWAYS_INLINE int8_t logical_and(const int8_t lhs,
259  const int8_t rhs,
260  const int8_t null_val) {
261  if (lhs == null_val) {
262  return rhs == 0 ? rhs : null_val;
263  }
264  if (rhs == null_val) {
265  return lhs == 0 ? lhs : null_val;
266  }
267  return (lhs && rhs) ? 1 : 0;
268 }
269 
270 extern "C" ALWAYS_INLINE int8_t logical_or(const int8_t lhs,
271  const int8_t rhs,
272  const int8_t null_val) {
273  if (lhs == null_val) {
274  return rhs == 0 ? null_val : rhs;
275  }
276  if (rhs == null_val) {
277  return lhs == 0 ? null_val : lhs;
278  }
279  return (lhs || rhs) ? 1 : 0;
280 }
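logical_and and logical_or implement SQL three-valued logic over int8_t with a null sentinel: FALSE absorbs NULL under AND, TRUE absorbs NULL under OR, and the remaining mixed cases stay NULL. A small standalone check of the AND case (the _sketch name and the -128 sentinel are illustrative only):

#include <cstdint>
#include <cstdio>

// Restated logical_and: NULL is absorbed by FALSE but not by TRUE.
int8_t logical_and_sketch(int8_t lhs, int8_t rhs, int8_t null_val) {
  if (lhs == null_val) return rhs == 0 ? rhs : null_val;
  if (rhs == null_val) return lhs == 0 ? lhs : null_val;
  return (lhs && rhs) ? 1 : 0;
}

int main() {
  const int8_t kNull = -128;  // illustrative boolean-null sentinel
  printf("%d\n", logical_and_sketch(kNull, 0, kNull));  // 0    (NULL AND FALSE = FALSE)
  printf("%d\n", logical_and_sketch(kNull, 1, kNull));  // -128 (NULL AND TRUE  = NULL)
  printf("%d\n", logical_and_sketch(1, 1, kNull));      // 1
  return 0;
}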
281 
282 // aggregator implementations
283 
284 extern "C" ALWAYS_INLINE uint64_t agg_count(uint64_t* agg, const int64_t) {
285  return (*agg)++;
286 }
287 
288 extern "C" ALWAYS_INLINE void agg_count_distinct_bitmap(int64_t* agg,
289  const int64_t val,
290  const int64_t min_val) {
291  const uint64_t bitmap_idx = val - min_val;
292  reinterpret_cast<int8_t*>(*agg)[bitmap_idx >> 3] |= (1 << (bitmap_idx & 7));
293 }
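agg_count_distinct_bitmap interprets *agg as the address of a caller-allocated bitmap and sets the bit for val - min_val, so exact COUNT(DISTINCT) over a bounded integer range reduces to a popcount at the end. A hedged sketch of that flow on a local buffer (the buffer size and the final popcount step are illustrative, not part of this file):

#include <cstdint>
#include <cstdio>
#include <cstring>

// Same bit-set step as agg_count_distinct_bitmap, restated on a local buffer.
void set_bit_sketch(int8_t* bitmap, int64_t val, int64_t min_val) {
  const uint64_t idx = val - min_val;
  bitmap[idx >> 3] |= (1 << (idx & 7));
}

int main() {
  int8_t bitmap[16];                  // covers values [min_val, min_val + 127]
  std::memset(bitmap, 0, sizeof bitmap);
  const int64_t min_val = 100;
  const int64_t vals[] = {100, 101, 101, 117};
  for (int64_t v : vals) {
    set_bit_sketch(bitmap, v, min_val);
  }
  int distinct = 0;                   // popcount over the whole bitmap
  for (int8_t byte : bitmap) {
    distinct += __builtin_popcount(static_cast<uint8_t>(byte));
  }
  printf("%d\n", distinct);           // 3
  return 0;
}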
294 
295 #define GPU_RT_STUB NEVER_INLINE __attribute__((optnone))
296 
297 extern "C" GPU_RT_STUB void agg_count_distinct_bitmap_gpu(int64_t*,
298  const int64_t,
299  const int64_t,
300  const int64_t,
301  const int64_t,
302  const uint64_t,
303  const uint64_t) {}
304 
305 extern "C" NEVER_INLINE void agg_approximate_count_distinct(int64_t* agg,
306  const int64_t key,
307  const uint32_t b) {
308  const uint64_t hash = MurmurHash64A(&key, sizeof(key), 0);
309  const uint32_t index = hash >> (64 - b);
310  const uint8_t rank = get_rank(hash << b, 64 - b);
311  uint8_t* M = reinterpret_cast<uint8_t*>(*agg);
312  M[index] = std::max(M[index], rank);
313 }
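agg_approximate_count_distinct is the HyperLogLog update step: the top b bits of the 64-bit hash select one of 2^b registers, and the rank of the remaining bits is folded in with a max. A hedged standalone sketch of that register update; rank_sketch mirrors what get_rank is assumed to compute (leading-zero run plus one), and the final harmonic-mean estimate happens elsewhere:

#include <algorithm>
#include <cstdint>
#include <cstdio>

// Assumed rank: one plus the number of leading zero bits within 'width' bits.
inline uint8_t rank_sketch(uint64_t bits, uint32_t width) {
  uint8_t r = 1;
  while (width-- && !(bits & (1ULL << 63))) {
    ++r;
    bits <<= 1;
  }
  return r;
}

// Same register update as agg_approximate_count_distinct, on a local array.
inline void hll_update_sketch(uint8_t* M, const uint64_t hash, const uint32_t b) {
  const uint32_t index = hash >> (64 - b);  // top b bits pick the register
  M[index] = std::max(M[index], rank_sketch(hash << b, 64 - b));
}

int main() {
  uint8_t M[16] = {};                       // b = 4 -> 16 registers
  hll_update_sketch(M, 0x9e3779b97f4a7c15ULL, 4);
  printf("register 9 holds rank %u\n", static_cast<unsigned>(M[9]));
  return 0;
}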
314 
315 extern "C" GPU_RT_STUB void agg_approximate_count_distinct_gpu(int64_t*,
316  const int64_t,
317  const uint32_t,
318  const int64_t,
319  const int64_t) {}
320 
321 extern "C" ALWAYS_INLINE int8_t bit_is_set(const int64_t bitset,
322  const int64_t val,
323  const int64_t min_val,
324  const int64_t max_val,
325  const int64_t null_val,
326  const int8_t null_bool_val) {
327  if (val == null_val) {
328  return null_bool_val;
329  }
330  if (val < min_val || val > max_val) {
331  return false;
332  }
333  const uint64_t bitmap_idx = val - min_val;
334  return (reinterpret_cast<const int8_t*>(bitset))[bitmap_idx >> 3] &
335  (1 << (bitmap_idx & 7))
336  ? 1
337  : 0;
338 }
339 
340 extern "C" ALWAYS_INLINE int64_t agg_sum(int64_t* agg, const int64_t val) {
341  const auto old = *agg;
342  *agg += val;
343  return old;
344 }
345 
346 extern "C" ALWAYS_INLINE void agg_max(int64_t* agg, const int64_t val) {
347  *agg = std::max(*agg, val);
348 }
349 
350 extern "C" ALWAYS_INLINE void agg_min(int64_t* agg, const int64_t val) {
351  *agg = std::min(*agg, val);
352 }
353 
354 extern "C" ALWAYS_INLINE void agg_id(int64_t* agg, const int64_t val) {
355  *agg = val;
356 }
357 
358 extern "C" ALWAYS_INLINE void agg_count_distinct_bitmap_skip_val(int64_t* agg,
359  const int64_t val,
360  const int64_t min_val,
361  const int64_t skip_val) {
362  if (val != skip_val) {
363  agg_count_distinct_bitmap(agg, val, min_val);
364  }
365 }
366 
367 extern "C" GPU_RT_STUB void agg_count_distinct_bitmap_skip_val_gpu(int64_t*,
368  const int64_t,
369  const int64_t,
370  const int64_t,
371  const int64_t,
372  const int64_t,
373  const uint64_t,
374  const uint64_t) {}
375 
376 extern "C" ALWAYS_INLINE uint32_t agg_count_int32(uint32_t* agg, const int32_t) {
377  return (*agg)++;
378 }
379 
380 extern "C" ALWAYS_INLINE int32_t agg_sum_int32(int32_t* agg, const int32_t val) {
381  const auto old = *agg;
382  *agg += val;
383  return old;
384 }
385 
386 #define DEF_AGG_MAX_INT(n) \
387  extern "C" ALWAYS_INLINE void agg_max_int##n(int##n##_t* agg, const int##n##_t val) { \
388  *agg = std::max(*agg, val); \
389  }
390 
391 DEF_AGG_MAX_INT(32)
392 DEF_AGG_MAX_INT(16)
393 DEF_AGG_MAX_INT(8)
394 #undef DEF_AGG_MAX_INT
395 
396 #define DEF_AGG_MIN_INT(n) \
397  extern "C" ALWAYS_INLINE void agg_min_int##n(int##n##_t* agg, const int##n##_t val) { \
398  *agg = std::min(*agg, val); \
399  }
400 
401 DEF_AGG_MIN_INT(32)
402 DEF_AGG_MIN_INT(16)
403 DEF_AGG_MIN_INT(8)
404 #undef DEF_AGG_MIN_INT
405 
406 #define DEF_AGG_ID_INT(n) \
407  extern "C" ALWAYS_INLINE void agg_id_int##n(int##n##_t* agg, const int##n##_t val) { \
408  *agg = val; \
409  }
410 
411 DEF_AGG_ID_INT(32)
412 DEF_AGG_ID_INT(16)
413 DEF_AGG_ID_INT(8)
414 #undef DEF_AGG_ID_INT
415 
416 #define DEF_WRITE_PROJECTION_INT(n) \
417  extern "C" ALWAYS_INLINE void write_projection_int##n( \
418  int8_t* slot_ptr, const int##n##_t val, const int64_t init_val) { \
419  if (val != init_val) { \
420  *reinterpret_cast<int##n##_t*>(slot_ptr) = val; \
421  } \
422  }
423 
424 DEF_WRITE_PROJECTION_INT(64)
425 DEF_WRITE_PROJECTION_INT(32)
426 #undef DEF_WRITE_PROJECTION_INT
427 
428 extern "C" ALWAYS_INLINE int64_t agg_sum_skip_val(int64_t* agg,
429  const int64_t val,
430  const int64_t skip_val) {
431  const auto old = *agg;
432  if (val != skip_val) {
433  if (old != skip_val) {
434  return agg_sum(agg, val);
435  } else {
436  *agg = val;
437  }
438  }
439  return old;
440 }
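agg_sum_skip_val ignores rows carrying the skip (null) sentinel and leaves the accumulator at the sentinel until the first real value arrives, so an all-null group still reports null. A short hedged illustration with a local accumulator (the _sketch names, the driver and the INT64_MIN sentinel are illustrative):

#include <cstdint>
#include <cstdio>

int64_t agg_sum_sketch(int64_t* agg, int64_t val) {
  const auto old = *agg;
  *agg += val;
  return old;
}

// Restated agg_sum_skip_val: the accumulator starts at skip_val and is
// replaced (not added to) by the first non-null value it sees.
int64_t agg_sum_skip_val_sketch(int64_t* agg, int64_t val, int64_t skip_val) {
  const auto old = *agg;
  if (val != skip_val) {
    if (old != skip_val) {
      return agg_sum_sketch(agg, val);
    }
    *agg = val;
  }
  return old;
}

int main() {
  const int64_t kNull = INT64_MIN;  // illustrative sentinel
  int64_t acc = kNull;              // initialized to the sentinel by the engine
  const int64_t vals[] = {kNull, 5, kNull, 7};
  for (int64_t v : vals) {
    agg_sum_skip_val_sketch(&acc, v, kNull);
  }
  printf("%lld\n", (long long)acc);  // 12; stays at the sentinel if all inputs are null
  return 0;
}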
441 
442 extern "C" ALWAYS_INLINE int32_t agg_sum_int32_skip_val(int32_t* agg,
443  const int32_t val,
444  const int32_t skip_val) {
445  const auto old = *agg;
446  if (val != skip_val) {
447  if (old != skip_val) {
448  return agg_sum_int32(agg, val);
449  } else {
450  *agg = val;
451  }
452  }
453  return old;
454 }
455 
456 extern "C" ALWAYS_INLINE uint64_t agg_count_skip_val(uint64_t* agg,
457  const int64_t val,
458  const int64_t skip_val) {
459  if (val != skip_val) {
460  return agg_count(agg, val);
461  }
462  return *agg;
463 }
464 
465 extern "C" ALWAYS_INLINE uint32_t agg_count_int32_skip_val(uint32_t* agg,
466  const int32_t val,
467  const int32_t skip_val) {
468  if (val != skip_val) {
469  return agg_count_int32(agg, val);
470  }
471  return *agg;
472 }
473 
474 #define DEF_SKIP_AGG_ADD(base_agg_func) \
475  extern "C" ALWAYS_INLINE void base_agg_func##_skip_val( \
476  DATA_T* agg, const DATA_T val, const DATA_T skip_val) { \
477  if (val != skip_val) { \
478  base_agg_func(agg, val); \
479  } \
480  }
481 
482 #define DEF_SKIP_AGG(base_agg_func) \
483  extern "C" ALWAYS_INLINE void base_agg_func##_skip_val( \
484  DATA_T* agg, const DATA_T val, const DATA_T skip_val) { \
485  if (val != skip_val) { \
486  const DATA_T old_agg = *agg; \
487  if (old_agg != skip_val) { \
488  base_agg_func(agg, val); \
489  } else { \
490  *agg = val; \
491  } \
492  } \
493  }
494 
495 #define DATA_T int64_t
496 DEF_SKIP_AGG(agg_max)
497 DEF_SKIP_AGG(agg_min)
498 #undef DATA_T
499 
500 #define DATA_T int32_t
501 DEF_SKIP_AGG(agg_max_int32)
502 DEF_SKIP_AGG(agg_min_int32)
503 #undef DATA_T
504 
505 #define DATA_T int16_t
506 DEF_SKIP_AGG(agg_max_int16)
507 DEF_SKIP_AGG(agg_min_int16)
508 #undef DATA_T
509 
510 #define DATA_T int8_t
511 DEF_SKIP_AGG(agg_max_int8)
512 DEF_SKIP_AGG(agg_min_int8)
513 #undef DATA_T
514 
515 #undef DEF_SKIP_AGG_ADD
516 #undef DEF_SKIP_AGG
517 
518 // TODO(alex): fix signature
519 
520 extern "C" ALWAYS_INLINE uint64_t agg_count_double(uint64_t* agg, const double val) {
521  return (*agg)++;
522 }
523 
524 extern "C" ALWAYS_INLINE void agg_sum_double(int64_t* agg, const double val) {
525  const auto r = *reinterpret_cast<const double*>(agg) + val;
526  *agg = *reinterpret_cast<const int64_t*>(may_alias_ptr(&r));
527 }
528 
529 extern "C" ALWAYS_INLINE void agg_max_double(int64_t* agg, const double val) {
530  const auto r = std::max(*reinterpret_cast<const double*>(agg), val);
531  *agg = *(reinterpret_cast<const int64_t*>(may_alias_ptr(&r)));
532 }
533 
534 extern "C" ALWAYS_INLINE void agg_min_double(int64_t* agg, const double val) {
535  const auto r = std::min(*reinterpret_cast<const double*>(agg), val);
536  *agg = *(reinterpret_cast<const int64_t*>(may_alias_ptr(&r)));
537 }
538 
539 extern "C" ALWAYS_INLINE void agg_id_double(int64_t* agg, const double val) {
540  *agg = *(reinterpret_cast<const int64_t*>(may_alias_ptr(&val)));
541 }
542 
543 extern "C" ALWAYS_INLINE uint32_t agg_count_float(uint32_t* agg, const float val) {
544  return (*agg)++;
545 }
546 
547 extern "C" ALWAYS_INLINE void agg_sum_float(int32_t* agg, const float val) {
548  const auto r = *reinterpret_cast<const float*>(agg) + val;
549  *agg = *reinterpret_cast<const int32_t*>(may_alias_ptr(&r));
550 }
551 
552 extern "C" ALWAYS_INLINE void agg_max_float(int32_t* agg, const float val) {
553  const auto r = std::max(*reinterpret_cast<const float*>(agg), val);
554  *agg = *(reinterpret_cast<const int32_t*>(may_alias_ptr(&r)));
555 }
556 
557 extern "C" ALWAYS_INLINE void agg_min_float(int32_t* agg, const float val) {
558  const auto r = std::min(*reinterpret_cast<const float*>(agg), val);
559  *agg = *(reinterpret_cast<const int32_t*>(may_alias_ptr(&r)));
560 }
561 
562 extern "C" ALWAYS_INLINE void agg_id_float(int32_t* agg, const float val) {
563  *agg = *(reinterpret_cast<const int32_t*>(may_alias_ptr(&val)));
564 }
565 
566 extern "C" ALWAYS_INLINE uint64_t agg_count_double_skip_val(uint64_t* agg,
567  const double val,
568  const double skip_val) {
569  if (val != skip_val) {
570  return agg_count_double(agg, val);
571  }
572  return *agg;
573 }
574 
575 extern "C" ALWAYS_INLINE uint32_t agg_count_float_skip_val(uint32_t* agg,
576  const float val,
577  const float skip_val) {
578  if (val != skip_val) {
579  return agg_count_float(agg, val);
580  }
581  return *agg;
582 }
583 
584 #define DEF_SKIP_AGG_ADD(base_agg_func) \
585  extern "C" ALWAYS_INLINE void base_agg_func##_skip_val( \
586  ADDR_T* agg, const DATA_T val, const DATA_T skip_val) { \
587  if (val != skip_val) { \
588  base_agg_func(agg, val); \
589  } \
590  }
591 
592 #define DEF_SKIP_AGG(base_agg_func) \
593  extern "C" ALWAYS_INLINE void base_agg_func##_skip_val( \
594  ADDR_T* agg, const DATA_T val, const DATA_T skip_val) { \
595  if (val != skip_val) { \
596  const ADDR_T old_agg = *agg; \
597  if (old_agg != *reinterpret_cast<const ADDR_T*>(may_alias_ptr(&skip_val))) { \
598  base_agg_func(agg, val); \
599  } else { \
600  *agg = *reinterpret_cast<const ADDR_T*>(may_alias_ptr(&val)); \
601  } \
602  } \
603  }
604 
605 #define DATA_T double
606 #define ADDR_T int64_t
607 DEF_SKIP_AGG(agg_sum_double)
608 DEF_SKIP_AGG(agg_max_double)
609 DEF_SKIP_AGG(agg_min_double)
610 #undef ADDR_T
611 #undef DATA_T
612 
613 #define DATA_T float
614 #define ADDR_T int32_t
615 DEF_SKIP_AGG(agg_sum_float)
616 DEF_SKIP_AGG(agg_max_float)
617 DEF_SKIP_AGG(agg_min_float)
618 #undef ADDR_T
619 #undef DATA_T
620 
621 #undef DEF_SKIP_AGG_ADD
622 #undef DEF_SKIP_AGG
623 
624 extern "C" ALWAYS_INLINE int64_t decimal_floor(const int64_t x, const int64_t scale) {
625  if (x >= 0) {
626  return x / scale * scale;
627  }
628  if (!(x % scale)) {
629  return x;
630  }
631  return x / scale * scale - scale;
632 }
633 
634 extern "C" ALWAYS_INLINE int64_t decimal_ceil(const int64_t x, const int64_t scale) {
635  return decimal_floor(x, scale) + (x % scale ? scale : 0);
636 }
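decimal_floor rounds a scaled integer down to a multiple of scale, stepping one bucket further down for negative values that are not already on a boundary (plain integer division truncates toward zero); decimal_ceil is derived from it. A worked check, assuming scale = 100:

#include <cstdint>
#include <cstdio>

int64_t decimal_floor_sketch(int64_t x, int64_t scale) {
  if (x >= 0) return x / scale * scale;
  if (!(x % scale)) return x;
  return x / scale * scale - scale;  // truncation went toward zero, step one bucket down
}

int64_t decimal_ceil_sketch(int64_t x, int64_t scale) {
  return decimal_floor_sketch(x, scale) + (x % scale ? scale : 0);
}

int main() {
  printf("%lld\n", (long long)decimal_floor_sketch(1249, 100));   // 1200
  printf("%lld\n", (long long)decimal_floor_sketch(-1249, 100));  // -1300
  printf("%lld\n", (long long)decimal_ceil_sketch(-1249, 100));   // -1200
  return 0;
}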
637 
638 // Shared memory aggregators. Should never be called,
639 // real implementations are in cuda_mapd_rt.cu.
640 #define DEF_SHARED_AGG_RET_STUBS(base_agg_func) \
641  extern "C" GPU_RT_STUB uint64_t base_agg_func##_shared(uint64_t* agg, \
642  const int64_t val) { \
643  return 0; \
644  } \
645  \
646  extern "C" GPU_RT_STUB uint64_t base_agg_func##_skip_val_shared( \
647  uint64_t* agg, const int64_t val, const int64_t skip_val) { \
648  return 0; \
649  } \
650  extern "C" GPU_RT_STUB uint32_t base_agg_func##_int32_shared(uint32_t* agg, \
651  const int32_t val) { \
652  return 0; \
653  } \
654  \
655  extern "C" GPU_RT_STUB uint32_t base_agg_func##_int32_skip_val_shared( \
656  uint32_t* agg, const int32_t val, const int32_t skip_val) { \
657  return 0; \
658  } \
659  \
660  extern "C" GPU_RT_STUB uint64_t base_agg_func##_double_shared(uint64_t* agg, \
661  const double val) { \
662  return 0; \
663  } \
664  \
665  extern "C" GPU_RT_STUB uint64_t base_agg_func##_double_skip_val_shared( \
666  uint64_t* agg, const double val, const double skip_val) { \
667  return 0; \
668  } \
669  extern "C" GPU_RT_STUB uint32_t base_agg_func##_float_shared(uint32_t* agg, \
670  const float val) { \
671  return 0; \
672  } \
673  \
674  extern "C" GPU_RT_STUB uint32_t base_agg_func##_float_skip_val_shared( \
675  uint32_t* agg, const float val, const float skip_val) { \
676  return 0; \
677  }
678 
679 #define DEF_SHARED_AGG_STUBS(base_agg_func) \
680  extern "C" GPU_RT_STUB void base_agg_func##_shared(int64_t* agg, const int64_t val) {} \
681  \
682  extern "C" GPU_RT_STUB void base_agg_func##_skip_val_shared( \
683  int64_t* agg, const int64_t val, const int64_t skip_val) {} \
684  extern "C" GPU_RT_STUB void base_agg_func##_int32_shared(int32_t* agg, \
685  const int32_t val) {} \
686  extern "C" GPU_RT_STUB void base_agg_func##_int16_shared(int16_t* agg, \
687  const int16_t val) {} \
688  extern "C" GPU_RT_STUB void base_agg_func##_int8_shared(int8_t* agg, \
689  const int8_t val) {} \
690  \
691  extern "C" GPU_RT_STUB void base_agg_func##_int32_skip_val_shared( \
692  int32_t* agg, const int32_t val, const int32_t skip_val) {} \
693  \
694  extern "C" GPU_RT_STUB void base_agg_func##_double_shared(int64_t* agg, \
695  const double val) {} \
696  \
697  extern "C" GPU_RT_STUB void base_agg_func##_double_skip_val_shared( \
698  int64_t* agg, const double val, const double skip_val) {} \
699  extern "C" GPU_RT_STUB void base_agg_func##_float_shared(int32_t* agg, \
700  const float val) {} \
701  \
702  extern "C" GPU_RT_STUB void base_agg_func##_float_skip_val_shared( \
703  int32_t* agg, const float val, const float skip_val) {}
704 
705 DEF_SHARED_AGG_RET_STUBS(agg_count)
706 DEF_SHARED_AGG_STUBS(agg_max)
707 DEF_SHARED_AGG_STUBS(agg_min)
708 DEF_SHARED_AGG_STUBS(agg_id)
709 
710 extern "C" GPU_RT_STUB void agg_max_int16_skip_val_shared(int16_t* agg,
711  const int16_t val,
712  const int16_t skip_val) {}
713 
714 extern "C" GPU_RT_STUB void agg_max_int8_skip_val_shared(int8_t* agg,
715  const int8_t val,
716  const int8_t skip_val) {}
717 
718 extern "C" GPU_RT_STUB void agg_min_int16_skip_val_shared(int16_t* agg,
719  const int16_t val,
720  const int16_t skip_val) {}
721 
722 extern "C" GPU_RT_STUB void agg_min_int8_skip_val_shared(int8_t* agg,
723  const int8_t val,
724  const int8_t skip_val) {}
725 
726 extern "C" GPU_RT_STUB void agg_id_double_shared_slow(int64_t* agg, const double* val) {}
727 
728 extern "C" GPU_RT_STUB int64_t agg_sum_shared(int64_t* agg, const int64_t val) {
729  return 0;
730 }
731 
732 extern "C" GPU_RT_STUB int64_t agg_sum_skip_val_shared(int64_t* agg,
733  const int64_t val,
734  const int64_t skip_val) {
735  return 0;
736 }
737 extern "C" GPU_RT_STUB int32_t agg_sum_int32_shared(int32_t* agg, const int32_t val) {
738  return 0;
739 }
740 
741 extern "C" GPU_RT_STUB int32_t agg_sum_int32_skip_val_shared(int32_t* agg,
742  const int32_t val,
743  const int32_t skip_val) {
744  return 0;
745 }
746 
747 extern "C" GPU_RT_STUB void agg_sum_double_shared(int64_t* agg, const double val) {}
748 
749 extern "C" GPU_RT_STUB void agg_sum_double_skip_val_shared(int64_t* agg,
750  const double val,
751  const double skip_val) {}
752 extern "C" GPU_RT_STUB void agg_sum_float_shared(int32_t* agg, const float val) {}
753 
754 extern "C" GPU_RT_STUB void agg_sum_float_skip_val_shared(int32_t* agg,
755  const float val,
756  const float skip_val) {}
757 
758 extern "C" GPU_RT_STUB void force_sync() {}
759 
760 extern "C" GPU_RT_STUB void sync_warp() {}
761 extern "C" GPU_RT_STUB void sync_warp_protected(int64_t thread_pos, int64_t row_count) {}
762 
763 // x64 stride functions
764 
765 extern "C" __attribute__((noinline)) int32_t pos_start_impl(int32_t* error_code) {
766  int32_t row_index_resume{0};
767  if (error_code) {
768  row_index_resume = error_code[0];
769  error_code[0] = 0;
770  }
771  return row_index_resume;
772 }
773 
774 extern "C" __attribute__((noinline)) int32_t group_buff_idx_impl() {
775  return pos_start_impl(nullptr);
776 }
777 
778 extern "C" __attribute__((noinline)) int32_t pos_step_impl() {
779  return 1;
780 }
781 
782 extern "C" GPU_RT_STUB int8_t thread_warp_idx(const int8_t warp_sz) {
783  return 0;
784 }
785 
786 #undef GPU_RT_STUB
787 
788 extern "C" ALWAYS_INLINE int32_t record_error_code(const int32_t err_code,
789  int32_t* error_codes) {
790  // NB: never override persistent error codes (with code greater than zero).
791  // On GPU, a projection query with a limit can run out of slots without it
792  // being an actual error if the limit has been hit. If a persistent error
793  // (division by zero, for example) occurs before running out of slots, we
794  // have to avoid overriding it, because there's a risk that the query would
795  // go through if we override with a potentially benign out-of-slots code.
796  if (err_code && error_codes[pos_start_impl(nullptr)] <= 0) {
797  error_codes[pos_start_impl(nullptr)] = err_code;
798  }
799  return err_code;
800 }
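The comment above spells out the policy: a positive (persistent) error code already recorded in this thread's slot must not be clobbered by a later, possibly benign code such as an out-of-slots signal. A hedged standalone sketch of that policy with a single-slot error array (slot selection via pos_start_impl is simplified to index 0, and the numeric codes are illustrative only):

#include <cstdio>

// Keep the first persistent (> 0) error; otherwise record the new code.
int record_error_code_sketch(int err_code, int* error_codes) {
  if (err_code && error_codes[0] <= 0) {
    error_codes[0] = err_code;
  }
  return err_code;
}

int main() {
  int error_codes[1] = {0};
  record_error_code_sketch(4, error_codes);   // e.g. a division-by-zero code (illustrative)
  record_error_code_sketch(-1, error_codes);  // e.g. a benign out-of-slots code (illustrative)
  printf("%d\n", error_codes[0]);             // 4: the persistent error survives
  return 0;
}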
801 
802 // group by helpers
803 
804 extern "C" __attribute__((noinline)) const int64_t* init_shared_mem_nop(
805  const int64_t* groups_buffer,
806  const int32_t groups_buffer_size) {
807  return groups_buffer;
808 }
809 
810 extern "C" __attribute__((noinline)) void write_back_nop(int64_t* dest,
811  int64_t* src,
812  const int32_t sz) {
813  // the body is not really needed, just make sure the call is not optimized away
814  assert(dest);
815 }
816 
817 extern "C" __attribute__((noinline)) const int64_t* init_shared_mem(
818  const int64_t* groups_buffer,
819  const int32_t groups_buffer_size) {
820  return init_shared_mem_nop(groups_buffer, groups_buffer_size);
821 }
822 
823 extern "C" __attribute__((noinline)) const int64_t* init_shared_mem_dynamic(
824  const int64_t* groups_buffer,
825  const int32_t groups_buffer_size) {
826  return nullptr;
827 }
828 
829 extern "C" __attribute__((noinline)) void write_back(int64_t* dest,
830  int64_t* src,
831  const int32_t sz) {
832  write_back_nop(dest, src, sz);
833 }
834 
835 extern "C" __attribute__((noinline)) void write_back_smem_nop(int64_t* dest,
836  int64_t* src,
837  const int32_t sz) {
838  assert(dest);
839 }
840 
841 extern "C" __attribute__((noinline)) void agg_from_smem_to_gmem_nop(int64_t* dest,
842  int64_t* src,
843  const int32_t sz) {
844  assert(dest);
845 }
846 
847 extern "C" __attribute__((noinline)) void
848 agg_from_smem_to_gmem_count_binId(int64_t* dest, int64_t* src, const int32_t sz) {
849  return agg_from_smem_to_gmem_nop(dest, src, sz);
850 }
851 
852 extern "C" __attribute__((noinline)) void
853 agg_from_smem_to_gmem_binId_count(int64_t* dest, int64_t* src, const int32_t sz) {
854  return agg_from_smem_to_gmem_nop(dest, src, sz);
855 }
856 
857 extern "C" __attribute__((noinline)) void init_group_by_buffer_gpu(
858  int64_t* groups_buffer,
859  const int64_t* init_vals,
860  const uint32_t groups_buffer_entry_count,
861  const uint32_t key_qw_count,
862  const uint32_t agg_col_count,
863  const bool keyless,
864  const int8_t warp_size) {
865  // the body is not really needed, just make sure the call is not optimized away
866  assert(groups_buffer);
867 }
868 
869 extern "C" __attribute__((noinline)) void init_columnar_group_by_buffer_gpu(
870  int64_t* groups_buffer,
871  const int64_t* init_vals,
872  const uint32_t groups_buffer_entry_count,
873  const uint32_t key_qw_count,
874  const uint32_t agg_col_count,
875  const bool keyless,
876  const bool blocks_share_memory,
877  const int32_t frag_idx) {
878  // the body is not really needed, just make sure the call is not optimized away
879  assert(groups_buffer);
880 }
881 
882 extern "C" __attribute__((noinline)) void init_group_by_buffer_impl(
883  int64_t* groups_buffer,
884  const int64_t* init_vals,
885  const uint32_t groups_buffer_entry_count,
886  const uint32_t key_qw_count,
887  const uint32_t agg_col_count,
888  const bool keyless,
889  const int8_t warp_size) {
890  // the body is not really needed, just make sure the call is not optimized away
891  assert(groups_buffer);
892 }
893 
894 template <typename T>
895 ALWAYS_INLINE int64_t* get_matching_group_value(int64_t* groups_buffer,
896  const uint32_t h,
897  const T* key,
898  const uint32_t key_count,
899  const uint32_t row_size_quad) {
900  auto off = h * row_size_quad;
901  auto row_ptr = reinterpret_cast<T*>(groups_buffer + off);
902  if (*row_ptr == get_empty_key<T>()) {
903  memcpy(row_ptr, key, key_count * sizeof(T));
904  auto row_ptr_i8 = reinterpret_cast<int8_t*>(row_ptr + key_count);
905  return reinterpret_cast<int64_t*>(align_to_int64(row_ptr_i8));
906  }
907  if (memcmp(row_ptr, key, key_count * sizeof(T)) == 0) {
908  auto row_ptr_i8 = reinterpret_cast<int8_t*>(row_ptr + key_count);
909  return reinterpret_cast<int64_t*>(align_to_int64(row_ptr_i8));
910  }
911  return nullptr;
912 }
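get_matching_group_value handles exactly one bucket: it claims the row if its first key word is the empty sentinel, returns the int64-aligned aggregate area on a key match, and returns nullptr on a collision. The probe loop that drives it lives in GroupByRuntime.cpp; the sketch below is an illustrative, assumption-laden version of such a caller using plain linear probing:

#include <cstdint>

// Illustrative driver: retry the row-level helper on successive buckets until
// it either claims an empty slot or finds the matching key.
using MatchFn = int64_t* (*)(int64_t* groups_buffer,
                             const uint32_t h,
                             const int64_t* key,
                             const uint32_t key_count,
                             const uint32_t row_size_quad);

int64_t* probe_group_value_sketch(int64_t* groups_buffer,
                                  const uint32_t entry_count,
                                  const uint32_t start_bucket,
                                  const int64_t* key,
                                  const uint32_t key_count,
                                  const uint32_t row_size_quad,
                                  MatchFn match_one_bucket) {
  uint32_t h = start_bucket;
  for (uint32_t probes = 0; probes < entry_count; ++probes) {
    if (int64_t* slot =
            match_one_bucket(groups_buffer, h, key, key_count, row_size_quad)) {
      return slot;                  // empty slot claimed or existing key matched
    }
    h = (h + 1) % entry_count;      // collision: linear probe to the next bucket
  }
  return nullptr;                   // table is full
}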
913 
914 extern "C" ALWAYS_INLINE int64_t* get_matching_group_value(int64_t* groups_buffer,
915  const uint32_t h,
916  const int64_t* key,
917  const uint32_t key_count,
918  const uint32_t key_width,
919  const uint32_t row_size_quad,
920  const int64_t* init_vals) {
921  switch (key_width) {
922  case 4:
923  return get_matching_group_value(groups_buffer,
924  h,
925  reinterpret_cast<const int32_t*>(key),
926  key_count,
927  row_size_quad);
928  case 8:
929  return get_matching_group_value(groups_buffer, h, key, key_count, row_size_quad);
930  default:;
931  }
932  return nullptr;
933 }
934 
935 template <typename T>
936 ALWAYS_INLINE int32_t get_matching_group_value_columnar_slot(int64_t* groups_buffer,
937  const uint32_t entry_count,
938  const uint32_t h,
939  const T* key,
940  const uint32_t key_count) {
941  auto off = h;
942  auto key_buffer = reinterpret_cast<T*>(groups_buffer);
943  if (key_buffer[off] == get_empty_key<T>()) {
944  for (size_t i = 0; i < key_count; ++i) {
945  key_buffer[off] = key[i];
946  off += entry_count;
947  }
948  return h;
949  }
950  off = h;
951  for (size_t i = 0; i < key_count; ++i) {
952  if (key_buffer[off] != key[i]) {
953  return -1;
954  }
955  off += entry_count;
956  }
957  return h;
958 }
959 
960 extern "C" ALWAYS_INLINE int32_t
961 get_matching_group_value_columnar_slot(int64_t* groups_buffer,
962  const uint32_t entry_count,
963  const uint32_t h,
964  const int64_t* key,
965  const uint32_t key_count,
966  const uint32_t key_width) {
967  switch (key_width) {
968  case 4:
969  return get_matching_group_value_columnar_slot(groups_buffer,
970  entry_count,
971  h,
972  reinterpret_cast<const int32_t*>(key),
973  key_count);
974  case 8:
975  return get_matching_group_value_columnar_slot(
976  groups_buffer, entry_count, h, key, key_count);
977  default:
978  return -1;
979  }
980  return -1;
981 }
982 
983 extern "C" ALWAYS_INLINE int64_t* get_matching_group_value_columnar(
984  int64_t* groups_buffer,
985  const uint32_t h,
986  const int64_t* key,
987  const uint32_t key_qw_count,
988  const size_t entry_count) {
989  auto off = h;
990  if (groups_buffer[off] == EMPTY_KEY_64) {
991  for (size_t i = 0; i < key_qw_count; ++i) {
992  groups_buffer[off] = key[i];
993  off += entry_count;
994  }
995  return &groups_buffer[off];
996  }
997  off = h;
998  for (size_t i = 0; i < key_qw_count; ++i) {
999  if (groups_buffer[off] != key[i]) {
1000  return nullptr;
1001  }
1002  off += entry_count;
1003  }
1004  return &groups_buffer[off];
1005 }
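In the columnar variant the components of one key are not contiguous: component i of bucket h lives at groups_buffer[h + i * entry_count], so both the empty check and the comparison walk the buffer with an entry_count stride. A tiny worked example of that addressing, assuming entry_count = 8 and a two-component key:

#include <cstdint>
#include <cstdio>

int main() {
  const uint32_t entry_count = 8;   // number of hash buckets
  const uint32_t key_qw_count = 2;  // two 64-bit key components
  const uint32_t h = 3;             // bucket index

  // Component i of bucket h lives at offset h + i * entry_count.
  for (uint32_t i = 0; i < key_qw_count; ++i) {
    printf("key component %u -> groups_buffer[%u]\n", i, h + i * entry_count);
  }
  return 0;
}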
1006 
1007 /*
1008  * For a particular hashed_index, returns the row-wise offset
1009  * to the first matching agg column in memory.
1010  * It also checks the corresponding group column and, if the entry is still
1011  * empty, initializes all of its key slots (all group columns are assumed
1012  * to be 64-bit wide).
1013  *
1014  * Memory layout:
1015  *
1016  * | prepended group columns (64-bit each) | agg columns |
1017  */
1018 extern "C" ALWAYS_INLINE int64_t* get_matching_group_value_perfect_hash(
1019  int64_t* groups_buffer,
1020  const uint32_t hashed_index,
1021  const int64_t* key,
1022  const uint32_t key_count,
1023  const uint32_t row_size_quad) {
1024  uint32_t off = hashed_index * row_size_quad;
1025  if (groups_buffer[off] == EMPTY_KEY_64) {
1026  for (uint32_t i = 0; i < key_count; ++i) {
1027  groups_buffer[off + i] = key[i];
1028  }
1029  }
1030  return groups_buffer + off + key_count;
1031 }
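With a perfect hash the bucket index is derived directly from the key, so the row starts at hashed_index * row_size_quad and the aggregate columns begin key_count slots later. A short worked example of the offset arithmetic, assuming row_size_quad = 4 and key_count = 1:

#include <cstdint>
#include <cstdio>

int main() {
  // | key (1 x int64) | 3 aggregate slots | per row, so row_size_quad = 4.
  const uint32_t row_size_quad = 4;
  const uint32_t key_count = 1;
  const uint32_t hashed_index = 7;

  const uint32_t row_off = hashed_index * row_size_quad;  // 28: start of row 7
  const uint32_t agg_off = row_off + key_count;           // 29: first aggregate column

  printf("row offset %u, first agg column offset %u\n", row_off, agg_off);
  return 0;
}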
1032 
1033 /*
1034  * For a particular hashed_index, find and initialize (if necessary) all the group
1035  * columns corresponding to a key. It is assumed that all group columns are 64-bit wide.
1036  */
1037 extern "C" ALWAYS_INLINE void set_matching_group_value_perfect_hash_columnar(
1038  int64_t* groups_buffer,
1039  const uint32_t hashed_index,
1040  const int64_t* key,
1041  const uint32_t key_count,
1042  const uint32_t entry_count) {
1043  if (groups_buffer[hashed_index] == EMPTY_KEY_64) {
1044  for (uint32_t i = 0; i < key_count; i++) {
1045  groups_buffer[i * entry_count + hashed_index] = key[i];
1046  }
1047  }
1048 }
1049 
1050 #include "GroupByRuntime.cpp"
1052 
1053 extern "C" ALWAYS_INLINE int64_t* get_group_value_fast_keyless(
1054  int64_t* groups_buffer,
1055  const int64_t key,
1056  const int64_t min_key,
1057  const int64_t /* bucket */,
1058  const uint32_t row_size_quad) {
1059  return groups_buffer + row_size_quad * (key - min_key);
1060 }
1061 
1062 extern "C" ALWAYS_INLINE int64_t* get_group_value_fast_keyless_semiprivate(
1063  int64_t* groups_buffer,
1064  const int64_t key,
1065  const int64_t min_key,
1066  const int64_t /* bucket */,
1067  const uint32_t row_size_quad,
1068  const uint8_t thread_warp_idx,
1069  const uint8_t warp_size) {
1070  return groups_buffer + row_size_quad * (warp_size * (key - min_key) + thread_warp_idx);
1071 }
1072 
1073 extern "C" ALWAYS_INLINE int8_t* extract_str_ptr(const uint64_t str_and_len) {
1074  return reinterpret_cast<int8_t*>(str_and_len & 0xffffffffffff);
1075 }
1076 
1077 extern "C" ALWAYS_INLINE int32_t extract_str_len(const uint64_t str_and_len) {
1078  return static_cast<int64_t>(str_and_len) >> 48;
1079 }
1080 
1081 extern "C" __attribute__((noinline)) int8_t* extract_str_ptr_noinline(
1082  const uint64_t str_and_len) {
1083  return extract_str_ptr(str_and_len);
1084 }
1085 
1086 extern "C" __attribute__((noinline)) int32_t extract_str_len_noinline(
1087  const uint64_t str_and_len) {
1088  return extract_str_len(str_and_len);
1089 }
1090 
1091 extern "C" ALWAYS_INLINE uint64_t string_pack(const int8_t* ptr, const int32_t len) {
1092  return (reinterpret_cast<const uint64_t>(ptr) & 0xffffffffffff) |
1093  (static_cast<const uint64_t>(len) << 48);
1094 }
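string_pack squeezes a pointer and a length into one uint64_t: the low 48 bits carry the pointer (enough for current x86-64 user-space addresses) and the high 16 bits carry the length, which extract_str_len recovers with an arithmetic right shift. A hedged round-trip check using locally restated helpers:

#include <cstdint>
#include <cstdio>
#include <cstring>

uint64_t string_pack_sketch(const int8_t* ptr, int32_t len) {
  return (reinterpret_cast<uint64_t>(ptr) & 0xffffffffffff) |
         (static_cast<uint64_t>(len) << 48);
}

int8_t* extract_str_ptr_sketch(uint64_t packed) {
  return reinterpret_cast<int8_t*>(packed & 0xffffffffffff);
}

int32_t extract_str_len_sketch(uint64_t packed) {
  return static_cast<int64_t>(packed) >> 48;  // arithmetic shift recovers the length
}

int main() {
  const char* s = "omnisci";
  const uint64_t packed =
      string_pack_sketch(reinterpret_cast<const int8_t*>(s), std::strlen(s));
  printf("%.*s (%d bytes)\n",
         extract_str_len_sketch(packed),
         reinterpret_cast<const char*>(extract_str_ptr_sketch(packed)),
         extract_str_len_sketch(packed));
  return 0;
}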
1095 
1096 #ifdef __clang__
1097 #include "../Utils/StringLike.cpp"
1098 #endif
1099 
1100 #ifndef __CUDACC__
1101 #include "TopKRuntime.cpp"
1102 #endif
1103 
1104 extern "C" ALWAYS_INLINE DEVICE int32_t char_length(const char* str,
1105  const int32_t str_len) {
1106  return str_len;
1107 }
1108 
1109 extern "C" ALWAYS_INLINE DEVICE int32_t char_length_nullable(const char* str,
1110  const int32_t str_len,
1111  const int32_t int_null) {
1112  if (!str) {
1113  return int_null;
1114  }
1115  return str_len;
1116 }
1117 
1118 extern "C" ALWAYS_INLINE DEVICE int32_t key_for_string_encoded(const int32_t str_id) {
1119  return str_id;
1120 }
1121 
1122 extern "C" ALWAYS_INLINE int64_t row_number_window_func(const int64_t output_buff,
1123  const int64_t pos) {
1124  return reinterpret_cast<const int64_t*>(output_buff)[pos];
1125 }
1126 
1127 extern "C" ALWAYS_INLINE double percent_window_func(const int64_t output_buff,
1128  const int64_t pos) {
1129  return reinterpret_cast<const double*>(output_buff)[pos];
1130 }
1131 
1132 extern "C" ALWAYS_INLINE double load_double(const int64_t* agg) {
1133  return *reinterpret_cast<const double*>(may_alias_ptr(agg));
1134 }
1135 
1136 extern "C" ALWAYS_INLINE float load_float(const int32_t* agg) {
1137  return *reinterpret_cast<const float*>(may_alias_ptr(agg));
1138 }
1139 
1140 extern "C" ALWAYS_INLINE double load_avg_int(const int64_t* sum,
1141  const int64_t* count,
1142  const double null_val) {
1143  return *count != 0 ? static_cast<double>(*sum) / *count : null_val;
1144 }
1145 
1146 extern "C" ALWAYS_INLINE double load_avg_decimal(const int64_t* sum,
1147  const int64_t* count,
1148  const double null_val,
1149  const uint32_t scale) {
1150  return *count != 0 ? (static_cast<double>(*sum) / pow(10, scale)) / *count : null_val;
1151 }
1152 
1153 extern "C" ALWAYS_INLINE double load_avg_double(const int64_t* agg,
1154  const int64_t* count,
1155  const double null_val) {
1156  return *count != 0 ? *reinterpret_cast<const double*>(may_alias_ptr(agg)) / *count
1157  : null_val;
1158 }
1159 
1160 extern "C" ALWAYS_INLINE double load_avg_float(const int32_t* agg,
1161  const int32_t* count,
1162  const double null_val) {
1163  return *count != 0 ? *reinterpret_cast<const float*>(may_alias_ptr(agg)) / *count
1164  : null_val;
1165 }
1166 
1167 extern "C" NEVER_INLINE void linear_probabilistic_count(uint8_t* bitmap,
1168  const uint32_t bitmap_bytes,
1169  const uint8_t* key_bytes,
1170  const uint32_t key_len) {
1171  const uint32_t bit_pos = MurmurHash1(key_bytes, key_len, 0) % (bitmap_bytes * 8);
1172  const uint32_t word_idx = bit_pos / 32;
1173  const uint32_t bit_idx = bit_pos % 32;
1174  reinterpret_cast<uint32_t*>(bitmap)[word_idx] |= 1 << bit_idx;
1175 }
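linear_probabilistic_count only records membership bits; the distinct-count estimate is derived later from how full the bitmap ended up. A hedged sketch of the standard linear-counting estimator such a bitmap supports, assuming m bits in total of which z are still zero (the estimator itself is not part of this file):

#include <cmath>
#include <cstdint>
#include <cstdio>

// Linear counting: n_hat = -m * ln(z / m), where m is the bitmap size in bits
// and z is the number of bits still zero after all keys were hashed in.
double linear_counting_estimate_sketch(uint32_t bitmap_bits, uint32_t zero_bits) {
  return -static_cast<double>(bitmap_bits) *
         std::log(static_cast<double>(zero_bits) / bitmap_bits);
}

int main() {
  // e.g. an 8 KB bitmap (65536 bits) with 61000 bits still unset.
  printf("%.1f\n", linear_counting_estimate_sketch(65536, 61000));
  return 0;
}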
1176 
1177 extern "C" __attribute__((noinline)) void query_stub_hoisted_literals(
1178  const int8_t** col_buffers,
1179  const int8_t* literals,
1180  const int64_t* num_rows,
1181  const uint64_t* frag_row_offsets,
1182  const int32_t* max_matched,
1183  const int64_t* init_agg_value,
1184  int64_t** out,
1185  uint32_t frag_idx,
1186  const int64_t* join_hash_tables,
1187  int32_t* error_code,
1188  int32_t* total_matched) {
1189  assert(col_buffers || literals || num_rows || frag_row_offsets || max_matched ||
1190  init_agg_value || out || frag_idx || error_code || join_hash_tables ||
1191  total_matched);
1192 }
1193 
1194 extern "C" void multifrag_query_hoisted_literals(const int8_t*** col_buffers,
1195  const uint64_t* num_fragments,
1196  const int8_t* literals,
1197  const int64_t* num_rows,
1198  const uint64_t* frag_row_offsets,
1199  const int32_t* max_matched,
1200  int32_t* total_matched,
1201  const int64_t* init_agg_value,
1202  int64_t** out,
1203  int32_t* error_code,
1204  const uint32_t* num_tables_ptr,
1205  const int64_t* join_hash_tables) {
1206  for (uint32_t i = 0; i < *num_fragments; ++i) {
1207  query_stub_hoisted_literals(col_buffers ? col_buffers[i] : nullptr,
1208  literals,
1209  &num_rows[i * (*num_tables_ptr)],
1210  &frag_row_offsets[i * (*num_tables_ptr)],
1211  max_matched,
1212  init_agg_value,
1213  out,
1214  i,
1215  join_hash_tables,
1216  total_matched,
1217  error_code);
1218  }
1219 }
1220 
1221 extern "C" __attribute__((noinline)) void query_stub(const int8_t** col_buffers,
1222  const int64_t* num_rows,
1223  const uint64_t* frag_row_offsets,
1224  const int32_t* max_matched,
1225  const int64_t* init_agg_value,
1226  int64_t** out,
1227  uint32_t frag_idx,
1228  const int64_t* join_hash_tables,
1229  int32_t* error_code,
1230  int32_t* total_matched) {
1231  assert(col_buffers || num_rows || frag_row_offsets || max_matched || init_agg_value ||
1232  out || frag_idx || error_code || join_hash_tables || total_matched);
1233 }
1234 
1235 extern "C" void multifrag_query(const int8_t*** col_buffers,
1236  const uint64_t* num_fragments,
1237  const int64_t* num_rows,
1238  const uint64_t* frag_row_offsets,
1239  const int32_t* max_matched,
1240  int32_t* total_matched,
1241  const int64_t* init_agg_value,
1242  int64_t** out,
1243  int32_t* error_code,
1244  const uint32_t* num_tables_ptr,
1245  const int64_t* join_hash_tables) {
1246  for (uint32_t i = 0; i < *num_fragments; ++i) {
1247  query_stub(col_buffers ? col_buffers[i] : nullptr,
1248  &num_rows[i * (*num_tables_ptr)],
1249  &frag_row_offsets[i * (*num_tables_ptr)],
1250  max_matched,
1251  init_agg_value,
1252  out,
1253  i,
1254  join_hash_tables,
1255  total_matched,
1256  error_code);
1257  }
1258 }