OmniSciDB  bf83d84833
RuntimeFunctions.cpp
1 /*
2  * Copyright 2017 MapD Technologies, Inc.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #ifdef __CUDACC__
18 #error This code is not intended to be compiled with a CUDA C++ compiler
19 #endif // __CUDACC__
20 
21 #include "RuntimeFunctions.h"
22 #include "../Shared/funcannotations.h"
23 #include "BufferCompaction.h"
24 #include "HyperLogLogRank.h"
25 #include "MurmurHash.h"
26 #include "Shared/quantile.h"
27 #include "TypePunning.h"
28 
29 #include <algorithm>
30 #include <atomic>
31 #include <chrono>
32 #include <cmath>
33 #include <cstdio>
34 #include <cstring>
35 #include <thread>
36 #include <tuple>
37 
38 // decoder implementations
39 
40 #include "DecodersImpl.h"
41 
42 // arithmetic operator implementations
43 
44 #define DEF_ARITH_NULLABLE(type, null_type, opname, opsym) \
45  extern "C" ALWAYS_INLINE type opname##_##type##_nullable( \
46  const type lhs, const type rhs, const null_type null_val) { \
47  if (lhs != null_val && rhs != null_val) { \
48  return lhs opsym rhs; \
49  } \
50  return null_val; \
51  }
52 
53 #define DEF_ARITH_NULLABLE_LHS(type, null_type, opname, opsym) \
54  extern "C" ALWAYS_INLINE type opname##_##type##_nullable_lhs( \
55  const type lhs, const type rhs, const null_type null_val) { \
56  if (lhs != null_val) { \
57  return lhs opsym rhs; \
58  } \
59  return null_val; \
60  }
61 
62 #define DEF_ARITH_NULLABLE_RHS(type, null_type, opname, opsym) \
63  extern "C" ALWAYS_INLINE type opname##_##type##_nullable_rhs( \
64  const type lhs, const type rhs, const null_type null_val) { \
65  if (rhs != null_val) { \
66  return lhs opsym rhs; \
67  } \
68  return null_val; \
69  }
70 
71 #define DEF_CMP_NULLABLE(type, null_type, opname, opsym) \
72  extern "C" ALWAYS_INLINE int8_t opname##_##type##_nullable( \
73  const type lhs, \
74  const type rhs, \
75  const null_type null_val, \
76  const int8_t null_bool_val) { \
77  if (lhs != null_val && rhs != null_val) { \
78  return lhs opsym rhs; \
79  } \
80  return null_bool_val; \
81  }
82 
83 #define DEF_CMP_NULLABLE_LHS(type, null_type, opname, opsym) \
84  extern "C" ALWAYS_INLINE int8_t opname##_##type##_nullable_lhs( \
85  const type lhs, \
86  const type rhs, \
87  const null_type null_val, \
88  const int8_t null_bool_val) { \
89  if (lhs != null_val) { \
90  return lhs opsym rhs; \
91  } \
92  return null_bool_val; \
93  }
94 
95 #define DEF_CMP_NULLABLE_RHS(type, null_type, opname, opsym) \
96  extern "C" ALWAYS_INLINE int8_t opname##_##type##_nullable_rhs( \
97  const type lhs, \
98  const type rhs, \
99  const null_type null_val, \
100  const int8_t null_bool_val) { \
101  if (rhs != null_val) { \
102  return lhs opsym rhs; \
103  } \
104  return null_bool_val; \
105  }
106 
107 #define DEF_SAFE_DIV_NULLABLE(type, null_type, opname) \
108  extern "C" ALWAYS_INLINE type safe_div_##type( \
109  const type lhs, const type rhs, const null_type null_val) { \
110  if (lhs != null_val && rhs != null_val && rhs != 0) { \
111  return lhs / rhs; \
112  } \
113  return null_val; \
114  }
115 
116 #define DEF_BINARY_NULLABLE_ALL_OPS(type, null_type) \
117  DEF_ARITH_NULLABLE(type, null_type, add, +) \
118  DEF_ARITH_NULLABLE(type, null_type, sub, -) \
119  DEF_ARITH_NULLABLE(type, null_type, mul, *) \
120  DEF_ARITH_NULLABLE(type, null_type, div, /) \
121  DEF_SAFE_DIV_NULLABLE(type, null_type, safe_div) \
122  DEF_ARITH_NULLABLE_LHS(type, null_type, add, +) \
123  DEF_ARITH_NULLABLE_LHS(type, null_type, sub, -) \
124  DEF_ARITH_NULLABLE_LHS(type, null_type, mul, *) \
125  DEF_ARITH_NULLABLE_LHS(type, null_type, div, /) \
126  DEF_ARITH_NULLABLE_RHS(type, null_type, add, +) \
127  DEF_ARITH_NULLABLE_RHS(type, null_type, sub, -) \
128  DEF_ARITH_NULLABLE_RHS(type, null_type, mul, *) \
129  DEF_ARITH_NULLABLE_RHS(type, null_type, div, /) \
130  DEF_CMP_NULLABLE(type, null_type, eq, ==) \
131  DEF_CMP_NULLABLE(type, null_type, ne, !=) \
132  DEF_CMP_NULLABLE(type, null_type, lt, <) \
133  DEF_CMP_NULLABLE(type, null_type, gt, >) \
134  DEF_CMP_NULLABLE(type, null_type, le, <=) \
135  DEF_CMP_NULLABLE(type, null_type, ge, >=) \
136  DEF_CMP_NULLABLE_LHS(type, null_type, eq, ==) \
137  DEF_CMP_NULLABLE_LHS(type, null_type, ne, !=) \
138  DEF_CMP_NULLABLE_LHS(type, null_type, lt, <) \
139  DEF_CMP_NULLABLE_LHS(type, null_type, gt, >) \
140  DEF_CMP_NULLABLE_LHS(type, null_type, le, <=) \
141  DEF_CMP_NULLABLE_LHS(type, null_type, ge, >=) \
142  DEF_CMP_NULLABLE_RHS(type, null_type, eq, ==) \
143  DEF_CMP_NULLABLE_RHS(type, null_type, ne, !=) \
144  DEF_CMP_NULLABLE_RHS(type, null_type, lt, <) \
145  DEF_CMP_NULLABLE_RHS(type, null_type, gt, >) \
146  DEF_CMP_NULLABLE_RHS(type, null_type, le, <=) \
147  DEF_CMP_NULLABLE_RHS(type, null_type, ge, >=)
148 
149 DEF_BINARY_NULLABLE_ALL_OPS(int8_t, int64_t)
150 DEF_BINARY_NULLABLE_ALL_OPS(int16_t, int64_t)
151 DEF_BINARY_NULLABLE_ALL_OPS(int32_t, int64_t)
152 DEF_BINARY_NULLABLE_ALL_OPS(int64_t, int64_t)
153 DEF_BINARY_NULLABLE_ALL_OPS(float, float)
154 DEF_BINARY_NULLABLE_ALL_OPS(double, double)
155 DEF_ARITH_NULLABLE(int8_t, int64_t, mod, %)
156 DEF_ARITH_NULLABLE(int16_t, int64_t, mod, %)
157 DEF_ARITH_NULLABLE(int32_t, int64_t, mod, %)
158 DEF_ARITH_NULLABLE(int64_t, int64_t, mod, %)
159 DEF_ARITH_NULLABLE_LHS(int8_t, int64_t, mod, %)
160 DEF_ARITH_NULLABLE_LHS(int16_t, int64_t, mod, %)
161 DEF_ARITH_NULLABLE_LHS(int32_t, int64_t, mod, %)
162 DEF_ARITH_NULLABLE_LHS(int64_t, int64_t, mod, %)
163 DEF_ARITH_NULLABLE_RHS(int8_t, int64_t, mod, %)
164 DEF_ARITH_NULLABLE_RHS(int16_t, int64_t, mod, %)
165 DEF_ARITH_NULLABLE_RHS(int32_t, int64_t, mod, %)
166 DEF_ARITH_NULLABLE_RHS(int64_t, int64_t, mod, %)
167 
168 #undef DEF_BINARY_NULLABLE_ALL_OPS
169 #undef DEF_SAFE_DIV_NULLABLE
170 #undef DEF_CMP_NULLABLE_RHS
171 #undef DEF_CMP_NULLABLE_LHS
172 #undef DEF_CMP_NULLABLE
173 #undef DEF_ARITH_NULLABLE_RHS
174 #undef DEF_ARITH_NULLABLE_LHS
175 #undef DEF_ARITH_NULLABLE
176 
177 extern "C" ALWAYS_INLINE int64_t scale_decimal_up(const int64_t operand,
178  const uint64_t scale,
179  const int64_t operand_null_val,
180  const int64_t result_null_val) {
181  return operand != operand_null_val ? operand * scale : result_null_val;
182 }
183 
184 extern "C" ALWAYS_INLINE int64_t scale_decimal_down_nullable(const int64_t operand,
185  const int64_t scale,
186  const int64_t null_val) {
187  // rounded scale down of a decimal
188  if (operand == null_val) {
189  return null_val;
190  }
191 
192  int64_t tmp = scale >> 1;
193  tmp = operand >= 0 ? operand + tmp : operand - tmp;
194  return tmp / scale;
195 }
196 
197 extern "C" ALWAYS_INLINE int64_t scale_decimal_down_not_nullable(const int64_t operand,
198  const int64_t scale,
199  const int64_t null_val) {
200  int64_t tmp = scale >> 1;
201  tmp = operand >= 0 ? operand + tmp : operand - tmp;
202  return tmp / scale;
203 }
204 
205 // Return floor(dividend / divisor).
206 // Assumes 0 < divisor.
207 extern "C" ALWAYS_INLINE int64_t floor_div_lhs(const int64_t dividend,
208  const int64_t divisor) {
209  return (dividend < 0 ? dividend - (divisor - 1) : dividend) / divisor;
210 }
211 
212 // Return floor(dividend / divisor) or NULL if dividend IS NULL.
213 // Assumes 0 < divisor.
214 extern "C" ALWAYS_INLINE int64_t floor_div_nullable_lhs(const int64_t dividend,
215  const int64_t divisor,
216  const int64_t null_val) {
217  return dividend == null_val ? null_val : floor_div_lhs(dividend, divisor);
218 }
219 
220 #define DEF_UMINUS_NULLABLE(type, null_type) \
221  extern "C" ALWAYS_INLINE type uminus_##type##_nullable(const type operand, \
222  const null_type null_val) { \
223  return operand == null_val ? null_val : -operand; \
224  }
225 
226 DEF_UMINUS_NULLABLE(int8_t, int8_t)
227 DEF_UMINUS_NULLABLE(int16_t, int16_t)
228 DEF_UMINUS_NULLABLE(int32_t, int32_t)
229 DEF_UMINUS_NULLABLE(int64_t, int64_t)
230 DEF_UMINUS_NULLABLE(float, float)
231 DEF_UMINUS_NULLABLE(double, double)
232 
233 #undef DEF_UMINUS_NULLABLE
234 
235 #define DEF_CAST_NULLABLE(from_type, to_type) \
236  extern "C" ALWAYS_INLINE to_type cast_##from_type##_to_##to_type##_nullable( \
237  const from_type operand, \
238  const from_type from_null_val, \
239  const to_type to_null_val) { \
240  return operand == from_null_val ? to_null_val : operand; \
241  }
242 
243 #define DEF_CAST_NULLABLE_BIDIR(type1, type2) \
244  DEF_CAST_NULLABLE(type1, type2) \
245  DEF_CAST_NULLABLE(type2, type1)
246 
247 DEF_CAST_NULLABLE_BIDIR(int8_t, int16_t)
248 DEF_CAST_NULLABLE_BIDIR(int8_t, int32_t)
249 DEF_CAST_NULLABLE_BIDIR(int8_t, int64_t)
250 DEF_CAST_NULLABLE_BIDIR(int16_t, int32_t)
251 DEF_CAST_NULLABLE_BIDIR(int16_t, int64_t)
252 DEF_CAST_NULLABLE_BIDIR(int32_t, int64_t)
253 DEF_CAST_NULLABLE_BIDIR(float, double)
254 DEF_CAST_NULLABLE_BIDIR(float, int8_t)
255 DEF_CAST_NULLABLE_BIDIR(float, int16_t)
256 DEF_CAST_NULLABLE_BIDIR(float, int32_t)
257 DEF_CAST_NULLABLE_BIDIR(float, int64_t)
258 DEF_CAST_NULLABLE_BIDIR(double, int8_t)
259 DEF_CAST_NULLABLE_BIDIR(double, int16_t)
260 DEF_CAST_NULLABLE_BIDIR(double, int32_t)
261 DEF_CAST_NULLABLE_BIDIR(double, int64_t)
262 DEF_CAST_NULLABLE(uint8_t, int32_t)
263 DEF_CAST_NULLABLE(uint16_t, int32_t)
264 
265 #undef DEF_CAST_NULLABLE_BIDIR
266 #undef DEF_CAST_NULLABLE
267 
268 extern "C" ALWAYS_INLINE int8_t logical_not(const int8_t operand, const int8_t null_val) {
269  return operand == null_val ? operand : (operand ? 0 : 1);
270 }
271 
272 extern "C" ALWAYS_INLINE int8_t logical_and(const int8_t lhs,
273  const int8_t rhs,
274  const int8_t null_val) {
275  if (lhs == null_val) {
276  return rhs == 0 ? rhs : null_val;
277  }
278  if (rhs == null_val) {
279  return lhs == 0 ? lhs : null_val;
280  }
281  return (lhs && rhs) ? 1 : 0;
282 }
283 
284 extern "C" ALWAYS_INLINE int8_t logical_or(const int8_t lhs,
285  const int8_t rhs,
286  const int8_t null_val) {
287  if (lhs == null_val) {
288  return rhs == 0 ? null_val : rhs;
289  }
290  if (rhs == null_val) {
291  return lhs == 0 ? null_val : lhs;
292  }
293  return (lhs || rhs) ? 1 : 0;
294 }
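// Illustrative sketch (editorial addition, not part of the original file): these helpers
// implement SQL three-valued logic over int8_t with a caller-chosen NULL sentinel; the
// sentinel -128 below is hypothetical.
#if 0  // example only; not compiled with the runtime
#include <cassert>
static void ternary_logic_example() {
  constexpr int8_t kNull = -128;                  // hypothetical boolean NULL sentinel
  assert(logical_and(1, kNull, kNull) == kNull);  // TRUE  AND NULL -> NULL
  assert(logical_and(0, kNull, kNull) == 0);      // FALSE AND NULL -> FALSE
  assert(logical_or(kNull, 1, kNull) == 1);       // NULL  OR TRUE  -> TRUE
  assert(logical_or(kNull, 0, kNull) == kNull);   // NULL  OR FALSE -> NULL
  assert(logical_not(kNull, kNull) == kNull);     // NOT NULL       -> NULL
}
#endif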
295 
296 // aggregator implementations
297 
298 extern "C" ALWAYS_INLINE uint64_t agg_count(uint64_t* agg, const int64_t) {
299  return (*agg)++;
300 }
301 
302 extern "C" ALWAYS_INLINE void agg_count_distinct_bitmap(int64_t* agg,
303  const int64_t val,
304  const int64_t min_val) {
305  const uint64_t bitmap_idx = val - min_val;
306  reinterpret_cast<int8_t*>(*agg)[bitmap_idx >> 3] |= (1 << (bitmap_idx & 7));
307 }
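// Illustrative sketch (editorial addition, not part of the original file): the slot *agg
// holds the bitmap's address as an int64_t, and each value sets bit (val - min_val).
#if 0  // example only; not compiled with the runtime
#include <cassert>
#include <cstdint>
static void count_distinct_bitmap_example() {
  int8_t bitmap[16] = {};                           // 128 bits: covers min_val..min_val+127
  int64_t agg = reinterpret_cast<int64_t>(bitmap);  // slot stores the buffer address
  agg_count_distinct_bitmap(&agg, /*val=*/42, /*min_val=*/0);
  assert(bitmap[42 >> 3] & (1 << (42 & 7)));        // bit 42 is now set
}
#endif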
308 
309 #ifdef _MSC_VER
310 #define GPU_RT_STUB NEVER_INLINE
311 #else
312 #define GPU_RT_STUB NEVER_INLINE __attribute__((optnone))
313 #endif
314 
315 extern "C" GPU_RT_STUB void agg_count_distinct_bitmap_gpu(int64_t*,
316  const int64_t,
317  const int64_t,
318  const int64_t,
319  const int64_t,
320  const uint64_t,
321  const uint64_t) {}
322 
323 extern "C" NEVER_INLINE void agg_approximate_count_distinct(int64_t* agg,
324  const int64_t key,
325  const uint32_t b) {
326  const uint64_t hash = MurmurHash64A(&key, sizeof(key), 0);
327  const uint32_t index = hash >> (64 - b);
328  const uint8_t rank = get_rank(hash << b, 64 - b);
329  uint8_t* M = reinterpret_cast<uint8_t*>(*agg);
330  M[index] = std::max(M[index], rank);
331 }
332 
333 extern "C" GPU_RT_STUB void agg_approximate_count_distinct_gpu(int64_t*,
334  const int64_t,
335  const uint32_t,
336  const int64_t,
337  const int64_t) {}
338 
339 extern "C" NEVER_INLINE void agg_approx_median(int64_t* agg, const double val) {
340  auto* t_digest = reinterpret_cast<quantile::TDigest*>(*agg);
341  t_digest->add(val);
342 }
343 
344 extern "C" ALWAYS_INLINE void agg_approx_median_skip_val(int64_t* agg,
345  const double val,
346  const int64_t skip_val) {
347  if (*reinterpret_cast<int64_t const*>(may_alias_ptr(&val)) != skip_val) {
348  agg_approx_median(agg, val);
349  }
350 }
351 
352 extern "C" ALWAYS_INLINE int8_t bit_is_set(const int64_t bitset,
353  const int64_t val,
354  const int64_t min_val,
355  const int64_t max_val,
356  const int64_t null_val,
357  const int8_t null_bool_val) {
358  if (val == null_val) {
359  return null_bool_val;
360  }
361  if (val < min_val || val > max_val) {
362  return 0;
363  }
364  if (!bitset) {
365  return 0;
366  }
367  const uint64_t bitmap_idx = val - min_val;
368  return (reinterpret_cast<const int8_t*>(bitset))[bitmap_idx >> 3] &
369  (1 << (bitmap_idx & 7))
370  ? 1
371  : 0;
372 }
373 
374 extern "C" ALWAYS_INLINE int64_t agg_sum(int64_t* agg, const int64_t val) {
375  const auto old = *agg;
376  *agg += val;
377  return old;
378 }
379 
380 extern "C" ALWAYS_INLINE void agg_max(int64_t* agg, const int64_t val) {
381  *agg = std::max(*agg, val);
382 }
383 
384 extern "C" ALWAYS_INLINE void agg_min(int64_t* agg, const int64_t val) {
385  *agg = std::min(*agg, val);
386 }
387 
388 extern "C" ALWAYS_INLINE void agg_id(int64_t* agg, const int64_t val) {
389  *agg = val;
390 }
391 
392 extern "C" ALWAYS_INLINE int32_t checked_single_agg_id(int64_t* agg,
393  const int64_t val,
394  const int64_t null_val) {
395  if (val == null_val) {
396  return 0;
397  }
398 
399  if (*agg == val) {
400  return 0;
401  } else if (*agg == null_val) {
402  *agg = val;
403  return 0;
404  } else {
405  // see Execute::ERR_SINGLE_VALUE_FOUND_MULTIPLE_VALUES
406  return 15;
407  }
408 }
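// Illustrative sketch (editorial addition, not part of the original file): SINGLE_VALUE
// semantics; a second, different non-NULL value yields the "found multiple values" code.
// The NULL sentinel below is hypothetical.
#if 0  // example only; not compiled with the runtime
#include <cassert>
static void checked_single_agg_id_example() {
  constexpr int64_t kNull = -1;  // hypothetical sentinel for this example
  int64_t slot = kNull;
  assert(checked_single_agg_id(&slot, 42, kNull) == 0);   // first value is accepted
  assert(checked_single_agg_id(&slot, 42, kNull) == 0);   // same value again is fine
  assert(checked_single_agg_id(&slot, 43, kNull) == 15);  // conflicting value -> error 15
}
#endif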
409 
410 extern "C" ALWAYS_INLINE void agg_count_distinct_bitmap_skip_val(int64_t* agg,
411  const int64_t val,
412  const int64_t min_val,
413  const int64_t skip_val) {
414  if (val != skip_val) {
415  agg_count_distinct_bitmap(agg, val, min_val);
416  }
417 }
418 
419 extern "C" GPU_RT_STUB void agg_count_distinct_bitmap_skip_val_gpu(int64_t*,
420  const int64_t,
421  const int64_t,
422  const int64_t,
423  const int64_t,
424  const int64_t,
425  const uint64_t,
426  const uint64_t) {}
427 
428 extern "C" ALWAYS_INLINE uint32_t agg_count_int32(uint32_t* agg, const int32_t) {
429  return (*agg)++;
430 }
431 
432 extern "C" ALWAYS_INLINE int32_t agg_sum_int32(int32_t* agg, const int32_t val) {
433  const auto old = *agg;
434  *agg += val;
435  return old;
436 }
437 
438 #define DEF_AGG_MAX_INT(n) \
439  extern "C" ALWAYS_INLINE void agg_max_int##n(int##n##_t* agg, const int##n##_t val) { \
440  *agg = std::max(*agg, val); \
441  }
442 
443 DEF_AGG_MAX_INT(32)
444 DEF_AGG_MAX_INT(16)
445 DEF_AGG_MAX_INT(8)
446 #undef DEF_AGG_MAX_INT
447 
448 #define DEF_AGG_MIN_INT(n) \
449  extern "C" ALWAYS_INLINE void agg_min_int##n(int##n##_t* agg, const int##n##_t val) { \
450  *agg = std::min(*agg, val); \
451  }
452 
453 DEF_AGG_MIN_INT(32)
454 DEF_AGG_MIN_INT(16)
455 DEF_AGG_MIN_INT(8)
456 #undef DEF_AGG_MIN_INT
457 
458 #define DEF_AGG_ID_INT(n) \
459  extern "C" ALWAYS_INLINE void agg_id_int##n(int##n##_t* agg, const int##n##_t val) { \
460  *agg = val; \
461  }
462 
463 #define DEF_CHECKED_SINGLE_AGG_ID_INT(n) \
464  extern "C" ALWAYS_INLINE int32_t checked_single_agg_id_int##n( \
465  int##n##_t* agg, const int##n##_t val, const int##n##_t null_val) { \
466  if (val == null_val) { \
467  return 0; \
468  } \
469  if (*agg == val) { \
470  return 0; \
471  } else if (*agg == null_val) { \
472  *agg = val; \
473  return 0; \
474  } else { \
475  /* see Execute::ERR_SINGLE_VALUE_FOUND_MULTIPLE_VALUES*/ \
476  return 15; \
477  } \
478  }
479 
480 DEF_AGG_ID_INT(32)
481 DEF_AGG_ID_INT(16)
482 DEF_AGG_ID_INT(8)
483 
484 DEF_CHECKED_SINGLE_AGG_ID_INT(32)
485 DEF_CHECKED_SINGLE_AGG_ID_INT(16)
486 DEF_CHECKED_SINGLE_AGG_ID_INT(8)
487 
488 #undef DEF_AGG_ID_INT
489 #undef DEF_CHECKED_SINGLE_AGG_ID_INT
490 
491 #define DEF_WRITE_PROJECTION_INT(n) \
492  extern "C" ALWAYS_INLINE void write_projection_int##n( \
493  int8_t* slot_ptr, const int##n##_t val, const int64_t init_val) { \
494  if (val != init_val) { \
495  *reinterpret_cast<int##n##_t*>(slot_ptr) = val; \
496  } \
497  }
498 
499 DEF_WRITE_PROJECTION_INT(64)
500 DEF_WRITE_PROJECTION_INT(32)
501 #undef DEF_WRITE_PROJECTION_INT
502 
503 extern "C" ALWAYS_INLINE int64_t agg_sum_skip_val(int64_t* agg,
504  const int64_t val,
505  const int64_t skip_val) {
506  const auto old = *agg;
507  if (val != skip_val) {
508  if (old != skip_val) {
509  return agg_sum(agg, val);
510  } else {
511  *agg = val;
512  }
513  }
514  return old;
515 }
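// Illustrative sketch (editorial addition, not part of the original file): the *_skip_val
// aggregates treat skip_val as SQL NULL; the accumulator starts at skip_val and only
// switches to a real running sum once a non-NULL value arrives. Sentinel is hypothetical.
#if 0  // example only; not compiled with the runtime
#include <cassert>
static void agg_sum_skip_val_example() {
  constexpr int64_t kNull = -1;          // hypothetical sentinel for this example
  int64_t acc = kNull;                   // "no rows seen yet"
  agg_sum_skip_val(&acc, kNull, kNull);  // NULL input: accumulator is left untouched
  assert(acc == kNull);
  agg_sum_skip_val(&acc, 5, kNull);      // first non-NULL value replaces the sentinel
  agg_sum_skip_val(&acc, 7, kNull);      // subsequent values are added
  assert(acc == 12);
}
#endif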
516 
517 extern "C" ALWAYS_INLINE int32_t agg_sum_int32_skip_val(int32_t* agg,
518  const int32_t val,
519  const int32_t skip_val) {
520  const auto old = *agg;
521  if (val != skip_val) {
522  if (old != skip_val) {
523  return agg_sum_int32(agg, val);
524  } else {
525  *agg = val;
526  }
527  }
528  return old;
529 }
530 
531 extern "C" ALWAYS_INLINE uint64_t agg_count_skip_val(uint64_t* agg,
532  const int64_t val,
533  const int64_t skip_val) {
534  if (val != skip_val) {
535  return agg_count(agg, val);
536  }
537  return *agg;
538 }
539 
540 extern "C" ALWAYS_INLINE uint32_t agg_count_int32_skip_val(uint32_t* agg,
541  const int32_t val,
542  const int32_t skip_val) {
543  if (val != skip_val) {
544  return agg_count_int32(agg, val);
545  }
546  return *agg;
547 }
548 
549 #define DEF_SKIP_AGG_ADD(base_agg_func) \
550  extern "C" ALWAYS_INLINE void base_agg_func##_skip_val( \
551  DATA_T* agg, const DATA_T val, const DATA_T skip_val) { \
552  if (val != skip_val) { \
553  base_agg_func(agg, val); \
554  } \
555  }
556 
557 #define DEF_SKIP_AGG(base_agg_func) \
558  extern "C" ALWAYS_INLINE void base_agg_func##_skip_val( \
559  DATA_T* agg, const DATA_T val, const DATA_T skip_val) { \
560  if (val != skip_val) { \
561  const DATA_T old_agg = *agg; \
562  if (old_agg != skip_val) { \
563  base_agg_func(agg, val); \
564  } else { \
565  *agg = val; \
566  } \
567  } \
568  }
569 
570 #define DATA_T int64_t
571 DEF_SKIP_AGG(agg_max)
572 DEF_SKIP_AGG(agg_min)
573 #undef DATA_T
574 
575 #define DATA_T int32_t
576 DEF_SKIP_AGG(agg_max_int32)
577 DEF_SKIP_AGG(agg_min_int32)
578 #undef DATA_T
579 
580 #define DATA_T int16_t
581 DEF_SKIP_AGG(agg_max_int16)
582 DEF_SKIP_AGG(agg_min_int16)
583 #undef DATA_T
584 
585 #define DATA_T int8_t
586 DEF_SKIP_AGG(agg_max_int8)
587 DEF_SKIP_AGG(agg_min_int8)
588 #undef DATA_T
589 
590 #undef DEF_SKIP_AGG_ADD
591 #undef DEF_SKIP_AGG
592 
593 // TODO(alex): fix signature
594 
595 extern "C" ALWAYS_INLINE uint64_t agg_count_double(uint64_t* agg, const double val) {
596  return (*agg)++;
597 }
598 
599 extern "C" ALWAYS_INLINE void agg_sum_double(int64_t* agg, const double val) {
600  const auto r = *reinterpret_cast<const double*>(agg) + val;
601  *agg = *reinterpret_cast<const int64_t*>(may_alias_ptr(&r));
602 }
603 
604 extern "C" ALWAYS_INLINE void agg_max_double(int64_t* agg, const double val) {
605  const auto r = std::max(*reinterpret_cast<const double*>(agg), val);
606  *agg = *(reinterpret_cast<const int64_t*>(may_alias_ptr(&r)));
607 }
608 
609 extern "C" ALWAYS_INLINE void agg_min_double(int64_t* agg, const double val) {
610  const auto r = std::min(*reinterpret_cast<const double*>(agg), val);
611  *agg = *(reinterpret_cast<const int64_t*>(may_alias_ptr(&r)));
612 }
613 
614 extern "C" ALWAYS_INLINE void agg_id_double(int64_t* agg, const double val) {
615  *agg = *(reinterpret_cast<const int64_t*>(may_alias_ptr(&val)));
616 }
617 
618 extern "C" ALWAYS_INLINE int32_t checked_single_agg_id_double(int64_t* agg,
619  const double val,
620  const double null_val) {
621  if (val == null_val) {
622  return 0;
623  }
624 
625  if (*agg == *(reinterpret_cast<const int64_t*>(may_alias_ptr(&val)))) {
626  return 0;
627  } else if (*agg == *(reinterpret_cast<const int64_t*>(may_alias_ptr(&null_val)))) {
628  *agg = *(reinterpret_cast<const int64_t*>(may_alias_ptr(&val)));
629  return 0;
630  } else {
631  // see Execute::ERR_SINGLE_VALUE_FOUND_MULTIPLE_VALUES
632  return 15;
633  }
634 }
635 
636 extern "C" ALWAYS_INLINE uint32_t agg_count_float(uint32_t* agg, const float val) {
637  return (*agg)++;
638 }
639 
640 extern "C" ALWAYS_INLINE void agg_sum_float(int32_t* agg, const float val) {
641  const auto r = *reinterpret_cast<const float*>(agg) + val;
642  *agg = *reinterpret_cast<const int32_t*>(may_alias_ptr(&r));
643 }
644 
645 extern "C" ALWAYS_INLINE void agg_max_float(int32_t* agg, const float val) {
646  const auto r = std::max(*reinterpret_cast<const float*>(agg), val);
647  *agg = *(reinterpret_cast<const int32_t*>(may_alias_ptr(&r)));
648 }
649 
650 extern "C" ALWAYS_INLINE void agg_min_float(int32_t* agg, const float val) {
651  const auto r = std::min(*reinterpret_cast<const float*>(agg), val);
652  *agg = *(reinterpret_cast<const int32_t*>(may_alias_ptr(&r)));
653 }
654 
655 extern "C" ALWAYS_INLINE void agg_id_float(int32_t* agg, const float val) {
656  *agg = *(reinterpret_cast<const int32_t*>(may_alias_ptr(&val)));
657 }
658 
659 extern "C" ALWAYS_INLINE int32_t checked_single_agg_id_float(int32_t* agg,
660  const float val,
661  const float null_val) {
662  if (val == null_val) {
663  return 0;
664  }
665 
666  if (*agg == *(reinterpret_cast<const int32_t*>(may_alias_ptr(&val)))) {
667  return 0;
668  } else if (*agg == *(reinterpret_cast<const int32_t*>(may_alias_ptr(&null_val)))) {
669  *agg = *(reinterpret_cast<const int32_t*>(may_alias_ptr(&val)));
670  return 0;
671  } else {
672  // see Execute::ERR_SINGLE_VALUE_FOUND_MULTIPLE_VALUES
673  return 15;
674  }
675 }
676 
677 extern "C" ALWAYS_INLINE uint64_t agg_count_double_skip_val(uint64_t* agg,
678  const double val,
679  const double skip_val) {
680  if (val != skip_val) {
681  return agg_count_double(agg, val);
682  }
683  return *agg;
684 }
685 
686 extern "C" ALWAYS_INLINE uint32_t agg_count_float_skip_val(uint32_t* agg,
687  const float val,
688  const float skip_val) {
689  if (val != skip_val) {
690  return agg_count_float(agg, val);
691  }
692  return *agg;
693 }
694 
695 #define DEF_SKIP_AGG_ADD(base_agg_func) \
696  extern "C" ALWAYS_INLINE void base_agg_func##_skip_val( \
697  ADDR_T* agg, const DATA_T val, const DATA_T skip_val) { \
698  if (val != skip_val) { \
699  base_agg_func(agg, val); \
700  } \
701  }
702 
703 #define DEF_SKIP_AGG(base_agg_func) \
704  extern "C" ALWAYS_INLINE void base_agg_func##_skip_val( \
705  ADDR_T* agg, const DATA_T val, const DATA_T skip_val) { \
706  if (val != skip_val) { \
707  const ADDR_T old_agg = *agg; \
708  if (old_agg != *reinterpret_cast<const ADDR_T*>(may_alias_ptr(&skip_val))) { \
709  base_agg_func(agg, val); \
710  } else { \
711  *agg = *reinterpret_cast<const ADDR_T*>(may_alias_ptr(&val)); \
712  } \
713  } \
714  }
715 
716 #define DATA_T double
717 #define ADDR_T int64_t
718 DEF_SKIP_AGG(agg_sum_double)
719 DEF_SKIP_AGG(agg_max_double)
720 DEF_SKIP_AGG(agg_min_double)
721 #undef ADDR_T
722 #undef DATA_T
723 
724 #define DATA_T float
725 #define ADDR_T int32_t
726 DEF_SKIP_AGG(agg_sum_float)
727 DEF_SKIP_AGG(agg_max_float)
728 DEF_SKIP_AGG(agg_min_float)
729 #undef ADDR_T
730 #undef DATA_T
731 
732 #undef DEF_SKIP_AGG_ADD
733 #undef DEF_SKIP_AGG
734 
735 extern "C" ALWAYS_INLINE int64_t decimal_floor(const int64_t x, const int64_t scale) {
736  if (x >= 0) {
737  return x / scale * scale;
738  }
739  if (!(x % scale)) {
740  return x;
741  }
742  return x / scale * scale - scale;
743 }
744 
745 extern "C" ALWAYS_INLINE int64_t decimal_ceil(const int64_t x, const int64_t scale) {
746  return decimal_floor(x, scale) + (x % scale ? scale : 0);
747 }
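// Illustrative sketch (editorial addition, not part of the original file): FLOOR/CEIL on
// the scaled integer representation of a decimal, e.g. DECIMAL(10,3) stores 1.250 as 1250.
#if 0  // example only; not compiled with the runtime
#include <cassert>
static void decimal_floor_ceil_example() {
  assert(decimal_floor(1250, 1000) == 1000);    // FLOOR(1.250)  -> 1.000
  assert(decimal_floor(-1250, 1000) == -2000);  // FLOOR(-1.250) -> -2.000
  assert(decimal_ceil(1250, 1000) == 2000);     // CEIL(1.250)   -> 2.000
  assert(decimal_ceil(-1250, 1000) == -1000);   // CEIL(-1.250)  -> -1.000
}
#endif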
748 
749 // Shared memory aggregators. Should never be called,
750 // real implementations are in cuda_mapd_rt.cu.
751 #define DEF_SHARED_AGG_RET_STUBS(base_agg_func) \
752  extern "C" GPU_RT_STUB uint64_t base_agg_func##_shared(uint64_t* agg, \
753  const int64_t val) { \
754  return 0; \
755  } \
756  \
757  extern "C" GPU_RT_STUB uint64_t base_agg_func##_skip_val_shared( \
758  uint64_t* agg, const int64_t val, const int64_t skip_val) { \
759  return 0; \
760  } \
761  extern "C" GPU_RT_STUB uint32_t base_agg_func##_int32_shared(uint32_t* agg, \
762  const int32_t val) { \
763  return 0; \
764  } \
765  \
766  extern "C" GPU_RT_STUB uint32_t base_agg_func##_int32_skip_val_shared( \
767  uint32_t* agg, const int32_t val, const int32_t skip_val) { \
768  return 0; \
769  } \
770  \
771  extern "C" GPU_RT_STUB uint64_t base_agg_func##_double_shared(uint64_t* agg, \
772  const double val) { \
773  return 0; \
774  } \
775  \
776  extern "C" GPU_RT_STUB uint64_t base_agg_func##_double_skip_val_shared( \
777  uint64_t* agg, const double val, const double skip_val) { \
778  return 0; \
779  } \
780  extern "C" GPU_RT_STUB uint32_t base_agg_func##_float_shared(uint32_t* agg, \
781  const float val) { \
782  return 0; \
783  } \
784  \
785  extern "C" GPU_RT_STUB uint32_t base_agg_func##_float_skip_val_shared( \
786  uint32_t* agg, const float val, const float skip_val) { \
787  return 0; \
788  }
789 
790 #define DEF_SHARED_AGG_STUBS(base_agg_func) \
791  extern "C" GPU_RT_STUB void base_agg_func##_shared(int64_t* agg, const int64_t val) {} \
792  \
793  extern "C" GPU_RT_STUB void base_agg_func##_skip_val_shared( \
794  int64_t* agg, const int64_t val, const int64_t skip_val) {} \
795  extern "C" GPU_RT_STUB void base_agg_func##_int32_shared(int32_t* agg, \
796  const int32_t val) {} \
797  extern "C" GPU_RT_STUB void base_agg_func##_int16_shared(int16_t* agg, \
798  const int16_t val) {} \
799  extern "C" GPU_RT_STUB void base_agg_func##_int8_shared(int8_t* agg, \
800  const int8_t val) {} \
801  \
802  extern "C" GPU_RT_STUB void base_agg_func##_int32_skip_val_shared( \
803  int32_t* agg, const int32_t val, const int32_t skip_val) {} \
804  \
805  extern "C" GPU_RT_STUB void base_agg_func##_double_shared(int64_t* agg, \
806  const double val) {} \
807  \
808  extern "C" GPU_RT_STUB void base_agg_func##_double_skip_val_shared( \
809  int64_t* agg, const double val, const double skip_val) {} \
810  extern "C" GPU_RT_STUB void base_agg_func##_float_shared(int32_t* agg, \
811  const float val) {} \
812  \
813  extern "C" GPU_RT_STUB void base_agg_func##_float_skip_val_shared( \
814  int32_t* agg, const float val, const float skip_val) {}
815 
816 DEF_SHARED_AGG_RET_STUBS(agg_count)
817 DEF_SHARED_AGG_STUBS(agg_max)
818 DEF_SHARED_AGG_STUBS(agg_min)
819 DEF_SHARED_AGG_STUBS(agg_id)
820 
821 extern "C" GPU_RT_STUB int32_t checked_single_agg_id_shared(int64_t* agg,
822  const int64_t val,
823  const int64_t null_val) {
824  return 0;
825 }
826 
827 extern "C" GPU_RT_STUB int32_t
828 checked_single_agg_id_int32_shared(int32_t* agg,
829  const int32_t val,
830  const int32_t null_val) {
831  return 0;
832 }
833 extern "C" GPU_RT_STUB int32_t
834 checked_single_agg_id_int16_shared(int16_t* agg,
835  const int16_t val,
836  const int16_t null_val) {
837  return 0;
838 }
839 extern "C" GPU_RT_STUB int32_t checked_single_agg_id_int8_shared(int8_t* agg,
840  const int8_t val,
841  const int8_t null_val) {
842  return 0;
843 }
844 
845 extern "C" GPU_RT_STUB int32_t
846 checked_single_agg_id_double_shared(int64_t* agg,
847  const double val,
848  const double null_val) {
849  return 0;
850 }
851 
852 extern "C" GPU_RT_STUB int32_t checked_single_agg_id_float_shared(int32_t* agg,
853  const float val,
854  const float null_val) {
855  return 0;
856 }
857 
858 extern "C" GPU_RT_STUB void agg_max_int16_skip_val_shared(int16_t* agg,
859  const int16_t val,
860  const int16_t skip_val) {}
861 
862 extern "C" GPU_RT_STUB void agg_max_int8_skip_val_shared(int8_t* agg,
863  const int8_t val,
864  const int8_t skip_val) {}
865 
866 extern "C" GPU_RT_STUB void agg_min_int16_skip_val_shared(int16_t* agg,
867  const int16_t val,
868  const int16_t skip_val) {}
869 
870 extern "C" GPU_RT_STUB void agg_min_int8_skip_val_shared(int8_t* agg,
871  const int8_t val,
872  const int8_t skip_val) {}
873 
874 extern "C" GPU_RT_STUB void agg_id_double_shared_slow(int64_t* agg, const double* val) {}
875 
876 extern "C" GPU_RT_STUB int64_t agg_sum_shared(int64_t* agg, const int64_t val) {
877  return 0;
878 }
879 
880 extern "C" GPU_RT_STUB int64_t agg_sum_skip_val_shared(int64_t* agg,
881  const int64_t val,
882  const int64_t skip_val) {
883  return 0;
884 }
885 extern "C" GPU_RT_STUB int32_t agg_sum_int32_shared(int32_t* agg, const int32_t val) {
886  return 0;
887 }
888 
889 extern "C" GPU_RT_STUB int32_t agg_sum_int32_skip_val_shared(int32_t* agg,
890  const int32_t val,
891  const int32_t skip_val) {
892  return 0;
893 }
894 
895 extern "C" GPU_RT_STUB void agg_sum_double_shared(int64_t* agg, const double val) {}
896 
897 extern "C" GPU_RT_STUB void agg_sum_double_skip_val_shared(int64_t* agg,
898  const double val,
899  const double skip_val) {}
900 extern "C" GPU_RT_STUB void agg_sum_float_shared(int32_t* agg, const float val) {}
901 
902 extern "C" GPU_RT_STUB void agg_sum_float_skip_val_shared(int32_t* agg,
903  const float val,
904  const float skip_val) {}
905 
906 extern "C" GPU_RT_STUB void force_sync() {}
907 
908 extern "C" GPU_RT_STUB void sync_warp() {}
909 extern "C" GPU_RT_STUB void sync_warp_protected(int64_t thread_pos, int64_t row_count) {}
910 extern "C" GPU_RT_STUB void sync_threadblock() {}
911 
912 extern "C" GPU_RT_STUB void write_back_non_grouped_agg(int64_t* input_buffer,
913  int64_t* output_buffer,
914  const int32_t num_agg_cols){};
915 // x64 stride functions
916 
917 extern "C" NEVER_INLINE int32_t pos_start_impl(int32_t* error_code) {
918  int32_t row_index_resume{0};
919  if (error_code) {
920  row_index_resume = error_code[0];
921  error_code[0] = 0;
922  }
923  return row_index_resume;
924 }
925 
926 extern "C" NEVER_INLINE int32_t group_buff_idx_impl() {
927  return pos_start_impl(nullptr);
928 }
929 
930 extern "C" NEVER_INLINE int32_t pos_step_impl() {
931  return 1;
932 }
933 
934 extern "C" GPU_RT_STUB int8_t thread_warp_idx(const int8_t warp_sz) {
935  return 0;
936 }
937 
938 extern "C" GPU_RT_STUB int64_t get_thread_index() {
939  return 0;
940 }
941 
942 extern "C" GPU_RT_STUB int64_t* declare_dynamic_shared_memory() {
943  return nullptr;
944 }
945 
946 extern "C" GPU_RT_STUB int64_t get_block_index() {
947  return 0;
948 }
949 
950 #undef GPU_RT_STUB
951 
952 extern "C" ALWAYS_INLINE void record_error_code(const int32_t err_code,
953  int32_t* error_codes) {
954  // NB: never override persistent error codes (with code greater than zero).
955  // On GPU, a projection query with a limit can run out of slots without it
956  // being an actual error if the limit has been hit. If a persistent error
957  // (division by zero, for example) occurs before running out of slots, we
958  // have to avoid overriding it, because there's a risk that the query would
959  // go through if we override with a potentially benign out-of-slots code.
960  if (err_code && error_codes[pos_start_impl(nullptr)] <= 0) {
961  error_codes[pos_start_impl(nullptr)] = err_code;
962  }
963 }
964 
965 extern "C" ALWAYS_INLINE int32_t get_error_code(int32_t* error_codes) {
966  return error_codes[pos_start_impl(nullptr)];
967 }
968 
969 // group by helpers
970 
971 extern "C" NEVER_INLINE const int64_t* init_shared_mem_nop(
972  const int64_t* groups_buffer,
973  const int32_t groups_buffer_size) {
974  return groups_buffer;
975 }
976 
977 extern "C" NEVER_INLINE void write_back_nop(int64_t* dest,
978  int64_t* src,
979  const int32_t sz) {
980  // the body is not really needed, just make sure the call is not optimized away
981  assert(dest);
982 }
983 
984 extern "C" int64_t* init_shared_mem(const int64_t* global_groups_buffer,
985  const int32_t groups_buffer_size) {
986  return nullptr;
987 }
988 
989 extern "C" NEVER_INLINE void init_group_by_buffer_gpu(
990  int64_t* groups_buffer,
991  const int64_t* init_vals,
992  const uint32_t groups_buffer_entry_count,
993  const uint32_t key_qw_count,
994  const uint32_t agg_col_count,
995  const bool keyless,
996  const int8_t warp_size) {
997  // the body is not really needed, just make sure the call is not optimized away
998  assert(groups_buffer);
999 }
1000 
1001 extern "C" NEVER_INLINE void init_columnar_group_by_buffer_gpu(
1002  int64_t* groups_buffer,
1003  const int64_t* init_vals,
1004  const uint32_t groups_buffer_entry_count,
1005  const uint32_t key_qw_count,
1006  const uint32_t agg_col_count,
1007  const bool keyless,
1008  const bool blocks_share_memory,
1009  const int32_t frag_idx) {
1010  // the body is not really needed, just make sure the call is not optimized away
1011  assert(groups_buffer);
1012 }
1013 
1014 extern "C" NEVER_INLINE void init_group_by_buffer_impl(
1015  int64_t* groups_buffer,
1016  const int64_t* init_vals,
1017  const uint32_t groups_buffer_entry_count,
1018  const uint32_t key_qw_count,
1019  const uint32_t agg_col_count,
1020  const bool keyless,
1021  const int8_t warp_size) {
1022  // the body is not really needed, just make sure the call is not optimized away
1023  assert(groups_buffer);
1024 }
1025 
1026 template <typename T>
1027 ALWAYS_INLINE int64_t* get_matching_group_value(int64_t* groups_buffer,
1028  const uint32_t h,
1029  const T* key,
1030  const uint32_t key_count,
1031  const uint32_t row_size_quad) {
1032  auto off = h * row_size_quad;
1033  auto row_ptr = reinterpret_cast<T*>(groups_buffer + off);
1034  if (*row_ptr == get_empty_key<T>()) {
1035  memcpy(row_ptr, key, key_count * sizeof(T));
1036  auto row_ptr_i8 = reinterpret_cast<int8_t*>(row_ptr + key_count);
1037  return reinterpret_cast<int64_t*>(align_to_int64(row_ptr_i8));
1038  }
1039  if (memcmp(row_ptr, key, key_count * sizeof(T)) == 0) {
1040  auto row_ptr_i8 = reinterpret_cast<int8_t*>(row_ptr + key_count);
1041  return reinterpret_cast<int64_t*>(align_to_int64(row_ptr_i8));
1042  }
1043  return nullptr;
1044 }
1045 
1046 extern "C" ALWAYS_INLINE int64_t* get_matching_group_value(int64_t* groups_buffer,
1047  const uint32_t h,
1048  const int64_t* key,
1049  const uint32_t key_count,
1050  const uint32_t key_width,
1051  const uint32_t row_size_quad) {
1052  switch (key_width) {
1053  case 4:
1054  return get_matching_group_value(groups_buffer,
1055  h,
1056  reinterpret_cast<const int32_t*>(key),
1057  key_count,
1058  row_size_quad);
1059  case 8:
1060  return get_matching_group_value(groups_buffer, h, key, key_count, row_size_quad);
1061  default:;
1062  }
1063  return nullptr;
1064 }
1065 
1066 template <typename T>
1067 ALWAYS_INLINE int32_t get_matching_group_value_columnar_slot(int64_t* groups_buffer,
1068  const uint32_t entry_count,
1069  const uint32_t h,
1070  const T* key,
1071  const uint32_t key_count) {
1072  auto off = h;
1073  auto key_buffer = reinterpret_cast<T*>(groups_buffer);
1074  if (key_buffer[off] == get_empty_key<T>()) {
1075  for (size_t i = 0; i < key_count; ++i) {
1076  key_buffer[off] = key[i];
1077  off += entry_count;
1078  }
1079  return h;
1080  }
1081  off = h;
1082  for (size_t i = 0; i < key_count; ++i) {
1083  if (key_buffer[off] != key[i]) {
1084  return -1;
1085  }
1086  off += entry_count;
1087  }
1088  return h;
1089 }
1090 
1091 extern "C" ALWAYS_INLINE int32_t
1092 get_matching_group_value_columnar_slot(int64_t* groups_buffer,
1093  const uint32_t entry_count,
1094  const uint32_t h,
1095  const int64_t* key,
1096  const uint32_t key_count,
1097  const uint32_t key_width) {
1098  switch (key_width) {
1099  case 4:
1100  return get_matching_group_value_columnar_slot(groups_buffer,
1101  entry_count,
1102  h,
1103  reinterpret_cast<const int32_t*>(key),
1104  key_count);
1105  case 8:
1106  return get_matching_group_value_columnar_slot(
1107  groups_buffer, entry_count, h, key, key_count);
1108  default:
1109  return -1;
1110  }
1111  return -1;
1112 }
1113 
1114 extern "C" ALWAYS_INLINE int64_t* get_matching_group_value_columnar(
1115  int64_t* groups_buffer,
1116  const uint32_t h,
1117  const int64_t* key,
1118  const uint32_t key_qw_count,
1119  const size_t entry_count) {
1120  auto off = h;
1121  if (groups_buffer[off] == EMPTY_KEY_64) {
1122  for (size_t i = 0; i < key_qw_count; ++i) {
1123  groups_buffer[off] = key[i];
1124  off += entry_count;
1125  }
1126  return &groups_buffer[off];
1127  }
1128  off = h;
1129  for (size_t i = 0; i < key_qw_count; ++i) {
1130  if (groups_buffer[off] != key[i]) {
1131  return nullptr;
1132  }
1133  off += entry_count;
1134  }
1135  return &groups_buffer[off];
1136 }
1137 
1138 /*
1139  * For a particular hashed_index, returns the row-wise offset
1140  * to the first matching agg column in memory.
1141  * It also checks the corresponding group column and, if the entry is
1142  * empty, initializes all of its key columns (it is assumed all group
1143  * columns are 64-bit wide).
1144  *
1145  * Memory layout:
1146  *
1147  * | prepended group columns (64-bit each) | agg columns |
1148  */
1149 extern "C" ALWAYS_INLINE int64_t* get_matching_group_value_perfect_hash(
1150  int64_t* groups_buffer,
1151  const uint32_t hashed_index,
1152  const int64_t* key,
1153  const uint32_t key_count,
1154  const uint32_t row_size_quad) {
1155  uint32_t off = hashed_index * row_size_quad;
1156  if (groups_buffer[off] == EMPTY_KEY_64) {
1157  for (uint32_t i = 0; i < key_count; ++i) {
1158  groups_buffer[off + i] = key[i];
1159  }
1160  }
1161  return groups_buffer + off + key_count;
1162 }
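// Illustrative sketch (editorial addition, not part of the original file): with a
// row_size_quad of 3 (one 64-bit key column plus two agg slots), hashed_index selects a
// row, the key is written if the slot is empty, and the returned pointer addresses the
// first agg column of that row.
#if 0  // example only; not compiled with the runtime
#include <cassert>
static void perfect_hash_layout_example() {
  int64_t buf[4 * 3];
  for (auto& q : buf) {
    q = EMPTY_KEY_64;  // initialize every quadword to the empty key
  }
  const int64_t key = 7;
  int64_t* aggs = get_matching_group_value_perfect_hash(
      buf, /*hashed_index=*/2, &key, /*key_count=*/1, /*row_size_quad=*/3);
  assert(buf[2 * 3] == 7);          // key column of row 2 was initialized
  assert(aggs == buf + 2 * 3 + 1);  // agg columns start right after the key
}
#endif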
1163 
1170 extern "C" ALWAYS_INLINE int64_t* get_matching_group_value_perfect_hash_keyless(
1171  int64_t* groups_buffer,
1172  const uint32_t hashed_index,
1173  const uint32_t row_size_quad) {
1174  return groups_buffer + row_size_quad * hashed_index;
1175 }
1176 
1177 /*
1178  * For a particular hashed_index, find and initialize (if necessary) all the group
1179  * columns corresponding to a key. It is assumed that all group columns are 64-bit wide.
1180  */
1181 extern "C" ALWAYS_INLINE void set_matching_group_value_perfect_hash_columnar(
1182  int64_t* groups_buffer,
1183  const uint32_t hashed_index,
1184  const int64_t* key,
1185  const uint32_t key_count,
1186  const uint32_t entry_count) {
1187  if (groups_buffer[hashed_index] == EMPTY_KEY_64) {
1188  for (uint32_t i = 0; i < key_count; i++) {
1189  groups_buffer[i * entry_count + hashed_index] = key[i];
1190  }
1191  }
1192 }
1193 
1194 #include "GroupByRuntime.cpp"
1196 
1197 extern "C" ALWAYS_INLINE int64_t* get_group_value_fast_keyless(
1198  int64_t* groups_buffer,
1199  const int64_t key,
1200  const int64_t min_key,
1201  const int64_t /* bucket */,
1202  const uint32_t row_size_quad) {
1203  return groups_buffer + row_size_quad * (key - min_key);
1204 }
1205 
1206 extern "C" ALWAYS_INLINE int64_t* get_group_value_fast_keyless_semiprivate(
1207  int64_t* groups_buffer,
1208  const int64_t key,
1209  const int64_t min_key,
1210  const int64_t /* bucket */,
1211  const uint32_t row_size_quad,
1212  const uint8_t thread_warp_idx,
1213  const uint8_t warp_size) {
1214  return groups_buffer + row_size_quad * (warp_size * (key - min_key) + thread_warp_idx);
1215 }
1216 
1217 extern "C" ALWAYS_INLINE int8_t* extract_str_ptr(const uint64_t str_and_len) {
1218  return reinterpret_cast<int8_t*>(str_and_len & 0xffffffffffff);
1219 }
1220 
1221 extern "C" ALWAYS_INLINE int32_t extract_str_len(const uint64_t str_and_len) {
1222  return static_cast<int64_t>(str_and_len) >> 48;
1223 }
1224 
1225 extern "C" NEVER_INLINE int8_t* extract_str_ptr_noinline(const uint64_t str_and_len) {
1226  return extract_str_ptr(str_and_len);
1227 }
1228 
1229 extern "C" NEVER_INLINE int32_t extract_str_len_noinline(const uint64_t str_and_len) {
1230  return extract_str_len(str_and_len);
1231 }
1232 
1233 extern "C" ALWAYS_INLINE uint64_t string_pack(const int8_t* ptr, const int32_t len) {
1234  return (reinterpret_cast<const uint64_t>(ptr) & 0xffffffffffff) |
1235  (static_cast<const uint64_t>(len) << 48);
1236 }
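// Illustrative sketch (editorial addition, not part of the original file): a string is
// packed into one uint64_t as a 48-bit pointer plus a 16-bit length (assuming the upper
// pointer bits are zero), and the extract_* helpers recover both fields.
#if 0  // example only; not compiled with the runtime
#include <cassert>
static void string_pack_example() {
  const char* msg = "hello";
  const uint64_t packed = string_pack(reinterpret_cast<const int8_t*>(msg), 5);
  assert(extract_str_len(packed) == 5);
  assert(extract_str_ptr(packed) == reinterpret_cast<const int8_t*>(msg));
}
#endif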
1237 
1238 #ifdef __clang__
1239 #include "../Utils/StringLike.cpp"
1240 #endif
1241 
1242 #ifndef __CUDACC__
1243 #include "TopKRuntime.cpp"
1244 #endif
1245 
1246 extern "C" ALWAYS_INLINE DEVICE int32_t char_length(const char* str,
1247  const int32_t str_len) {
1248  return str_len;
1249 }
1250 
1251 extern "C" ALWAYS_INLINE DEVICE int32_t char_length_nullable(const char* str,
1252  const int32_t str_len,
1253  const int32_t int_null) {
1254  if (!str) {
1255  return int_null;
1256  }
1257  return str_len;
1258 }
1259 
1260 extern "C" ALWAYS_INLINE DEVICE int32_t key_for_string_encoded(const int32_t str_id) {
1261  return str_id;
1262 }
1263 
1264 extern "C" ALWAYS_INLINE DEVICE bool sample_ratio(const double proportion,
1265  const int64_t row_offset) {
1266  const int64_t threshold = 4294967296 * proportion;
1267  return (row_offset * 2654435761) % 4294967296 < threshold;
1268 }
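// Illustrative sketch (editorial addition, not part of the original file): sample_ratio
// maps row_offset through a multiplicative hash (2654435761 is close to 2^32 / phi, a
// Knuth-style constant) and keeps roughly `proportion` of the rows. A crude check:
#if 0  // example only; not compiled with the runtime
#include <cassert>
static void sample_ratio_example() {
  int64_t kept = 0;
  for (int64_t row = 0; row < 100000; ++row) {
    kept += sample_ratio(0.1, row) ? 1 : 0;
  }
  assert(kept > 8000 && kept < 12000);  // ~10% of 100000, with generous slack
}
#endif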
1269 
1270 extern "C" ALWAYS_INLINE int64_t row_number_window_func(const int64_t output_buff,
1271  const int64_t pos) {
1272  return reinterpret_cast<const int64_t*>(output_buff)[pos];
1273 }
1274 
1275 extern "C" ALWAYS_INLINE double percent_window_func(const int64_t output_buff,
1276  const int64_t pos) {
1277  return reinterpret_cast<const double*>(output_buff)[pos];
1278 }
1279 
1280 extern "C" ALWAYS_INLINE double load_double(const int64_t* agg) {
1281  return *reinterpret_cast<const double*>(may_alias_ptr(agg));
1282 }
1283 
1284 extern "C" ALWAYS_INLINE float load_float(const int32_t* agg) {
1285  return *reinterpret_cast<const float*>(may_alias_ptr(agg));
1286 }
1287 
1288 extern "C" ALWAYS_INLINE double load_avg_int(const int64_t* sum,
1289  const int64_t* count,
1290  const double null_val) {
1291  return *count != 0 ? static_cast<double>(*sum) / *count : null_val;
1292 }
1293 
1294 extern "C" ALWAYS_INLINE double load_avg_decimal(const int64_t* sum,
1295  const int64_t* count,
1296  const double null_val,
1297  const uint32_t scale) {
1298  return *count != 0 ? (static_cast<double>(*sum) / pow(10, scale)) / *count : null_val;
1299 }
1300 
1301 extern "C" ALWAYS_INLINE double load_avg_double(const int64_t* agg,
1302  const int64_t* count,
1303  const double null_val) {
1304  return *count != 0 ? *reinterpret_cast<const double*>(may_alias_ptr(agg)) / *count
1305  : null_val;
1306 }
1307 
1308 extern "C" ALWAYS_INLINE double load_avg_float(const int32_t* agg,
1309  const int32_t* count,
1310  const double null_val) {
1311  return *count != 0 ? *reinterpret_cast<const float*>(may_alias_ptr(agg)) / *count
1312  : null_val;
1313 }
1314 
1315 extern "C" NEVER_INLINE void linear_probabilistic_count(uint8_t* bitmap,
1316  const uint32_t bitmap_bytes,
1317  const uint8_t* key_bytes,
1318  const uint32_t key_len) {
1319  const uint32_t bit_pos = MurmurHash1(key_bytes, key_len, 0) % (bitmap_bytes * 8);
1320  const uint32_t word_idx = bit_pos / 32;
1321  const uint32_t bit_idx = bit_pos % 32;
1322  reinterpret_cast<uint32_t*>(bitmap)[word_idx] |= 1 << bit_idx;
1323 }
1324 
1325 extern "C" NEVER_INLINE void query_stub_hoisted_literals(const int8_t** col_buffers,
1326  const int8_t* literals,
1327  const int64_t* num_rows,
1328  const uint64_t* frag_row_offsets,
1329  const int32_t* max_matched,
1330  const int64_t* init_agg_value,
1331  int64_t** out,
1332  uint32_t frag_idx,
1333  const int64_t* join_hash_tables,
1334  int32_t* error_code,
1335  int32_t* total_matched) {
1336  assert(col_buffers || literals || num_rows || frag_row_offsets || max_matched ||
1337  init_agg_value || out || frag_idx || error_code || join_hash_tables ||
1338  total_matched);
1339 }
1340 
1341 extern "C" void multifrag_query_hoisted_literals(const int8_t*** col_buffers,
1342  const uint64_t* num_fragments,
1343  const int8_t* literals,
1344  const int64_t* num_rows,
1345  const uint64_t* frag_row_offsets,
1346  const int32_t* max_matched,
1347  int32_t* total_matched,
1348  const int64_t* init_agg_value,
1349  int64_t** out,
1350  int32_t* error_code,
1351  const uint32_t* num_tables_ptr,
1352  const int64_t* join_hash_tables) {
1353  for (uint32_t i = 0; i < *num_fragments; ++i) {
1354  query_stub_hoisted_literals(col_buffers ? col_buffers[i] : nullptr,
1355  literals,
1356  &num_rows[i * (*num_tables_ptr)],
1357  &frag_row_offsets[i * (*num_tables_ptr)],
1358  max_matched,
1359  init_agg_value,
1360  out,
1361  i,
1362  join_hash_tables,
1363  total_matched,
1364  error_code);
1365  }
1366 }
1367 
1368 extern "C" NEVER_INLINE void query_stub(const int8_t** col_buffers,
1369  const int64_t* num_rows,
1370  const uint64_t* frag_row_offsets,
1371  const int32_t* max_matched,
1372  const int64_t* init_agg_value,
1373  int64_t** out,
1374  uint32_t frag_idx,
1375  const int64_t* join_hash_tables,
1376  int32_t* error_code,
1377  int32_t* total_matched) {
1378  assert(col_buffers || num_rows || frag_row_offsets || max_matched || init_agg_value ||
1379  out || frag_idx || error_code || join_hash_tables || total_matched);
1380 }
1381 
1382 extern "C" void multifrag_query(const int8_t*** col_buffers,
1383  const uint64_t* num_fragments,
1384  const int64_t* num_rows,
1385  const uint64_t* frag_row_offsets,
1386  const int32_t* max_matched,
1387  int32_t* total_matched,
1388  const int64_t* init_agg_value,
1389  int64_t** out,
1390  int32_t* error_code,
1391  const uint32_t* num_tables_ptr,
1392  const int64_t* join_hash_tables) {
1393  for (uint32_t i = 0; i < *num_fragments; ++i) {
1394  query_stub(col_buffers ? col_buffers[i] : nullptr,
1395  &num_rows[i * (*num_tables_ptr)],
1396  &frag_row_offsets[i * (*num_tables_ptr)],
1397  max_matched,
1398  init_agg_value,
1399  out,
1400  i,
1401  join_hash_tables,
1402  total_matched,
1403  error_code);
1404  }
1405 }
1406 
1407 extern "C" bool check_interrupt() {
1408  if (check_interrupt_init(static_cast<unsigned>(INT_CHECK))) {
1409  return true;
1410  }
1411  return false;
1412 }
1413 
1414 extern "C" bool check_interrupt_init(unsigned command) {
1415  static std::atomic_bool runtime_interrupt_flag{false};
1416 
1417  if (command == static_cast<unsigned>(INT_CHECK)) {
1418  if (runtime_interrupt_flag.load()) {
1419  return true;
1420  }
1421  return false;
1422  }
1423  if (command == static_cast<unsigned>(INT_ABORT)) {
1424  runtime_interrupt_flag.store(true);
1425  return false;
1426  }
1427  if (command == static_cast<unsigned>(INT_RESET)) {
1428  runtime_interrupt_flag.store(false);
1429  return false;
1430  }
1431  return false;
1432 }