OmniSciDB  06b3bd477c
RuntimeFunctions.cpp
1 /*
2  * Copyright 2017 MapD Technologies, Inc.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #ifdef __CUDACC__
18 #error This code is not intended to be compiled with a CUDA C++ compiler
19 #endif // __CUDACC__
20 
21 #include "RuntimeFunctions.h"
22 #include "../Shared/funcannotations.h"
23 #include "BufferCompaction.h"
24 #include "HyperLogLogRank.h"
25 #include "MurmurHash.h"
26 #include "TypePunning.h"
27 
28 #include <algorithm>
29 #include <atomic>
30 #include <chrono>
31 #include <cmath>
32 #include <cstring>
33 #include <thread>
34 #include <tuple>
35 
36 // decoder implementations
37 
38 #include "DecodersImpl.h"
39 
40 // arithmetic operator implementations
41 
42 #define DEF_ARITH_NULLABLE(type, null_type, opname, opsym) \
43  extern "C" ALWAYS_INLINE type opname##_##type##_nullable( \
44  const type lhs, const type rhs, const null_type null_val) { \
45  if (lhs != null_val && rhs != null_val) { \
46  return lhs opsym rhs; \
47  } \
48  return null_val; \
49  }
50 
51 #define DEF_ARITH_NULLABLE_LHS(type, null_type, opname, opsym) \
52  extern "C" ALWAYS_INLINE type opname##_##type##_nullable_lhs( \
53  const type lhs, const type rhs, const null_type null_val) { \
54  if (lhs != null_val) { \
55  return lhs opsym rhs; \
56  } \
57  return null_val; \
58  }
59 
60 #define DEF_ARITH_NULLABLE_RHS(type, null_type, opname, opsym) \
61  extern "C" ALWAYS_INLINE type opname##_##type##_nullable_rhs( \
62  const type lhs, const type rhs, const null_type null_val) { \
63  if (rhs != null_val) { \
64  return lhs opsym rhs; \
65  } \
66  return null_val; \
67  }
68 
69 #define DEF_CMP_NULLABLE(type, null_type, opname, opsym) \
70  extern "C" ALWAYS_INLINE int8_t opname##_##type##_nullable( \
71  const type lhs, \
72  const type rhs, \
73  const null_type null_val, \
74  const int8_t null_bool_val) { \
75  if (lhs != null_val && rhs != null_val) { \
76  return lhs opsym rhs; \
77  } \
78  return null_bool_val; \
79  }
80 
81 #define DEF_CMP_NULLABLE_LHS(type, null_type, opname, opsym) \
82  extern "C" ALWAYS_INLINE int8_t opname##_##type##_nullable_lhs( \
83  const type lhs, \
84  const type rhs, \
85  const null_type null_val, \
86  const int8_t null_bool_val) { \
87  if (lhs != null_val) { \
88  return lhs opsym rhs; \
89  } \
90  return null_bool_val; \
91  }
92 
93 #define DEF_CMP_NULLABLE_RHS(type, null_type, opname, opsym) \
94  extern "C" ALWAYS_INLINE int8_t opname##_##type##_nullable_rhs( \
95  const type lhs, \
96  const type rhs, \
97  const null_type null_val, \
98  const int8_t null_bool_val) { \
99  if (rhs != null_val) { \
100  return lhs opsym rhs; \
101  } \
102  return null_bool_val; \
103  }
104 
105 #define DEF_SAFE_DIV_NULLABLE(type, null_type, opname) \
106  extern "C" ALWAYS_INLINE type safe_div_##type( \
107  const type lhs, const type rhs, const null_type null_val) { \
108  if (lhs != null_val && rhs != null_val && rhs != 0) { \
109  return lhs / rhs; \
110  } \
111  return null_val; \
112  }
113 
114 #define DEF_BINARY_NULLABLE_ALL_OPS(type, null_type) \
115  DEF_ARITH_NULLABLE(type, null_type, add, +) \
116  DEF_ARITH_NULLABLE(type, null_type, sub, -) \
117  DEF_ARITH_NULLABLE(type, null_type, mul, *) \
118  DEF_ARITH_NULLABLE(type, null_type, div, /) \
119  DEF_SAFE_DIV_NULLABLE(type, null_type, safe_div) \
120  DEF_ARITH_NULLABLE_LHS(type, null_type, add, +) \
121  DEF_ARITH_NULLABLE_LHS(type, null_type, sub, -) \
122  DEF_ARITH_NULLABLE_LHS(type, null_type, mul, *) \
123  DEF_ARITH_NULLABLE_LHS(type, null_type, div, /) \
124  DEF_ARITH_NULLABLE_RHS(type, null_type, add, +) \
125  DEF_ARITH_NULLABLE_RHS(type, null_type, sub, -) \
126  DEF_ARITH_NULLABLE_RHS(type, null_type, mul, *) \
127  DEF_ARITH_NULLABLE_RHS(type, null_type, div, /) \
128  DEF_CMP_NULLABLE(type, null_type, eq, ==) \
129  DEF_CMP_NULLABLE(type, null_type, ne, !=) \
130  DEF_CMP_NULLABLE(type, null_type, lt, <) \
131  DEF_CMP_NULLABLE(type, null_type, gt, >) \
132  DEF_CMP_NULLABLE(type, null_type, le, <=) \
133  DEF_CMP_NULLABLE(type, null_type, ge, >=) \
134  DEF_CMP_NULLABLE_LHS(type, null_type, eq, ==) \
135  DEF_CMP_NULLABLE_LHS(type, null_type, ne, !=) \
136  DEF_CMP_NULLABLE_LHS(type, null_type, lt, <) \
137  DEF_CMP_NULLABLE_LHS(type, null_type, gt, >) \
138  DEF_CMP_NULLABLE_LHS(type, null_type, le, <=) \
139  DEF_CMP_NULLABLE_LHS(type, null_type, ge, >=) \
140  DEF_CMP_NULLABLE_RHS(type, null_type, eq, ==) \
141  DEF_CMP_NULLABLE_RHS(type, null_type, ne, !=) \
142  DEF_CMP_NULLABLE_RHS(type, null_type, lt, <) \
143  DEF_CMP_NULLABLE_RHS(type, null_type, gt, >) \
144  DEF_CMP_NULLABLE_RHS(type, null_type, le, <=) \
145  DEF_CMP_NULLABLE_RHS(type, null_type, ge, >=)
146 
147 DEF_BINARY_NULLABLE_ALL_OPS(int8_t, int64_t)
148 DEF_BINARY_NULLABLE_ALL_OPS(int16_t, int64_t)
149 DEF_BINARY_NULLABLE_ALL_OPS(int32_t, int64_t)
150 DEF_BINARY_NULLABLE_ALL_OPS(int64_t, int64_t)
151 DEF_BINARY_NULLABLE_ALL_OPS(float, float)
152 DEF_BINARY_NULLABLE_ALL_OPS(double, double)
153 DEF_ARITH_NULLABLE(int8_t, int64_t, mod, %)
154 DEF_ARITH_NULLABLE(int16_t, int64_t, mod, %)
155 DEF_ARITH_NULLABLE(int32_t, int64_t, mod, %)
156 DEF_ARITH_NULLABLE(int64_t, int64_t, mod, %)
157 DEF_ARITH_NULLABLE_LHS(int8_t, int64_t, mod, %)
158 DEF_ARITH_NULLABLE_LHS(int16_t, int64_t, mod, %)
159 DEF_ARITH_NULLABLE_LHS(int32_t, int64_t, mod, %)
160 DEF_ARITH_NULLABLE_LHS(int64_t, int64_t, mod, %)
161 DEF_ARITH_NULLABLE_RHS(int8_t, int64_t, mod, %)
162 DEF_ARITH_NULLABLE_RHS(int16_t, int64_t, mod, %)
163 DEF_ARITH_NULLABLE_RHS(int32_t, int64_t, mod, %)
164 DEF_ARITH_NULLABLE_RHS(int64_t, int64_t, mod, %)
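// Example expansion: DEF_ARITH_NULLABLE(int32_t, int64_t, add, +) defines
//   extern "C" int32_t add_int32_t_nullable(int32_t lhs, int32_t rhs, int64_t null_val)
// which returns null_val when either operand equals the sentinel and lhs + rhs otherwise.
// The _lhs/_rhs variants only null-check the side that can actually be NULL at the
// call site, and safe_div additionally returns null_val on division by zero.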
165 
166 #undef DEF_BINARY_NULLABLE_ALL_OPS
167 #undef DEF_SAFE_DIV_NULLABLE
168 #undef DEF_CMP_NULLABLE_RHS
169 #undef DEF_CMP_NULLABLE_LHS
170 #undef DEF_CMP_NULLABLE
171 #undef DEF_ARITH_NULLABLE_RHS
172 #undef DEF_ARITH_NULLABLE_LHS
173 #undef DEF_ARITH_NULLABLE
174 
175 extern "C" ALWAYS_INLINE int64_t scale_decimal_up(const int64_t operand,
176  const uint64_t scale,
177  const int64_t operand_null_val,
178  const int64_t result_null_val) {
179  return operand != operand_null_val ? operand * scale : result_null_val;
180 }
181 
182 extern "C" ALWAYS_INLINE int64_t scale_decimal_down_nullable(const int64_t operand,
183  const int64_t scale,
184  const int64_t null_val) {
185  // rounded scale down of a decimal
186  if (operand == null_val) {
187  return null_val;
188  }
189 
190  int64_t tmp = scale >> 1;
191  tmp = operand >= 0 ? operand + tmp : operand - tmp;
192  return tmp / scale;
193 }
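// Rounds to nearest by adding (or subtracting, for negative operands) half the scale
// before the integer division, e.g. operand = 1250, scale = 100 -> (1250 + 50) / 100 = 13.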
194 
195 extern "C" ALWAYS_INLINE int64_t scale_decimal_down_not_nullable(const int64_t operand,
196  const int64_t scale,
197  const int64_t null_val) {
198  int64_t tmp = scale >> 1;
199  tmp = operand >= 0 ? operand + tmp : operand - tmp;
200  return tmp / scale;
201 }
202 
203 // Return floor(dividend / divisor) or NULL if dividend IS NULL.
204 // Assumes 0 < divisor.
205 extern "C" ALWAYS_INLINE int64_t floor_div_nullable_lhs(const int64_t dividend,
206  const int64_t divisor,
207  const int64_t null_val) {
208  if (dividend == null_val) {
209  return null_val;
210  } else {
211  return (dividend < 0 ? dividend - (divisor - 1) : dividend) / divisor;
212  }
213 }
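// Unlike plain C++ integer division (which truncates toward zero), this rounds toward
// negative infinity, e.g. floor_div_nullable_lhs(-7, 2, null_val) -> (-7 - 1) / 2 = -4.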
214 
215 #define DEF_UMINUS_NULLABLE(type, null_type) \
216  extern "C" ALWAYS_INLINE type uminus_##type##_nullable(const type operand, \
217  const null_type null_val) { \
218  return operand == null_val ? null_val : -operand; \
219  }
220 
221 DEF_UMINUS_NULLABLE(int8_t, int8_t)
222 DEF_UMINUS_NULLABLE(int16_t, int16_t)
223 DEF_UMINUS_NULLABLE(int32_t, int32_t)
224 DEF_UMINUS_NULLABLE(int64_t, int64_t)
225 DEF_UMINUS_NULLABLE(float, float)
226 DEF_UMINUS_NULLABLE(double, double)
227 
228 #undef DEF_UMINUS_NULLABLE
229 
230 #define DEF_CAST_NULLABLE(from_type, to_type) \
231  extern "C" ALWAYS_INLINE to_type cast_##from_type##_to_##to_type##_nullable( \
232  const from_type operand, \
233  const from_type from_null_val, \
234  const to_type to_null_val) { \
235  return operand == from_null_val ? to_null_val : operand; \
236  }
237 
238 #define DEF_CAST_NULLABLE_BIDIR(type1, type2) \
239  DEF_CAST_NULLABLE(type1, type2) \
240  DEF_CAST_NULLABLE(type2, type1)
241 
242 DEF_CAST_NULLABLE_BIDIR(int8_t, int16_t)
243 DEF_CAST_NULLABLE_BIDIR(int8_t, int32_t)
244 DEF_CAST_NULLABLE_BIDIR(int8_t, int64_t)
245 DEF_CAST_NULLABLE_BIDIR(int16_t, int32_t)
246 DEF_CAST_NULLABLE_BIDIR(int16_t, int64_t)
247 DEF_CAST_NULLABLE_BIDIR(int32_t, int64_t)
248 DEF_CAST_NULLABLE_BIDIR(float, double)
249 DEF_CAST_NULLABLE_BIDIR(float, int8_t)
250 DEF_CAST_NULLABLE_BIDIR(float, int16_t)
251 DEF_CAST_NULLABLE_BIDIR(float, int32_t)
252 DEF_CAST_NULLABLE_BIDIR(float, int64_t)
253 DEF_CAST_NULLABLE_BIDIR(double, int8_t)
254 DEF_CAST_NULLABLE_BIDIR(double, int16_t)
255 DEF_CAST_NULLABLE_BIDIR(double, int32_t)
256 DEF_CAST_NULLABLE_BIDIR(double, int64_t)
257 DEF_CAST_NULLABLE(uint8_t, int32_t)
258 DEF_CAST_NULLABLE(uint16_t, int32_t)
259 
260 #undef DEF_CAST_NULLABLE_BIDIR
261 #undef DEF_CAST_NULLABLE
262 
263 extern "C" ALWAYS_INLINE int8_t logical_not(const int8_t operand, const int8_t null_val) {
264  return operand == null_val ? operand : (operand ? 0 : 1);
265 }
266 
267 extern "C" ALWAYS_INLINE int8_t logical_and(const int8_t lhs,
268  const int8_t rhs,
269  const int8_t null_val) {
270  if (lhs == null_val) {
271  return rhs == 0 ? rhs : null_val;
272  }
273  if (rhs == null_val) {
274  return lhs == 0 ? lhs : null_val;
275  }
276  return (lhs && rhs) ? 1 : 0;
277 }
278 
279 extern "C" ALWAYS_INLINE int8_t logical_or(const int8_t lhs,
280  const int8_t rhs,
281  const int8_t null_val) {
282  if (lhs == null_val) {
283  return rhs == 0 ? null_val : rhs;
284  }
285  if (rhs == null_val) {
286  return lhs == 0 ? null_val : lhs;
287  }
288  return (lhs || rhs) ? 1 : 0;
289 }
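// logical_and / logical_or implement SQL three-valued logic with null_val as the
// unknown state: NULL AND FALSE = FALSE, NULL AND TRUE = NULL, NULL OR TRUE = TRUE,
// NULL OR FALSE = NULL.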
290 
291 // aggregator implementations
292 
293 extern "C" ALWAYS_INLINE uint64_t agg_count(uint64_t* agg, const int64_t) {
294  return (*agg)++;
295 }
296 
297 extern "C" ALWAYS_INLINE void agg_count_distinct_bitmap(int64_t* agg,
298  const int64_t val,
299  const int64_t min_val) {
300  const uint64_t bitmap_idx = val - min_val;
301  reinterpret_cast<int8_t*>(*agg)[bitmap_idx >> 3] |= (1 << (bitmap_idx & 7));
302 }
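// *agg holds a pointer (stored as an int64_t) to a caller-allocated bitmap; the bit
// for a value is addressed as byte (val - min_val) >> 3, bit (val - min_val) & 7.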
303 
304 #define GPU_RT_STUB NEVER_INLINE __attribute__((optnone))
305 
306 extern "C" GPU_RT_STUB void agg_count_distinct_bitmap_gpu(int64_t*,
307  const int64_t,
308  const int64_t,
309  const int64_t,
310  const int64_t,
311  const uint64_t,
312  const uint64_t) {}
313 
314 extern "C" NEVER_INLINE void agg_approximate_count_distinct(int64_t* agg,
315  const int64_t key,
316  const uint32_t b) {
317  const uint64_t hash = MurmurHash64A(&key, sizeof(key), 0);
318  const uint32_t index = hash >> (64 - b);
319  const uint8_t rank = get_rank(hash << b, 64 - b);
320  uint8_t* M = reinterpret_cast<uint8_t*>(*agg);
321  M[index] = std::max(M[index], rank);
322 }
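// HyperLogLog register update: the top b bits of the 64-bit Murmur hash select one of
// 2^b registers, and each register keeps the maximum rank observed for the remaining
// 64 - b bits (see get_rank in HyperLogLogRank.h).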
323 
324 extern "C" GPU_RT_STUB void agg_approximate_count_distinct_gpu(int64_t*,
325  const int64_t,
326  const uint32_t,
327  const int64_t,
328  const int64_t) {}
329 
330 extern "C" ALWAYS_INLINE int8_t bit_is_set(const int64_t bitset,
331  const int64_t val,
332  const int64_t min_val,
333  const int64_t max_val,
334  const int64_t null_val,
335  const int8_t null_bool_val) {
336  if (val == null_val) {
337  return null_bool_val;
338  }
339  if (val < min_val || val > max_val) {
340  return 0;
341  }
342  if (!bitset) {
343  return 0;
344  }
345  const uint64_t bitmap_idx = val - min_val;
346  return (reinterpret_cast<const int8_t*>(bitset))[bitmap_idx >> 3] &
347  (1 << (bitmap_idx & 7))
348  ? 1
349  : 0;
350 }
351 
352 extern "C" ALWAYS_INLINE int64_t agg_sum(int64_t* agg, const int64_t val) {
353  const auto old = *agg;
354  *agg += val;
355  return old;
356 }
357 
358 extern "C" ALWAYS_INLINE void agg_max(int64_t* agg, const int64_t val) {
359  *agg = std::max(*agg, val);
360 }
361 
362 extern "C" ALWAYS_INLINE void agg_min(int64_t* agg, const int64_t val) {
363  *agg = std::min(*agg, val);
364 }
365 
366 extern "C" ALWAYS_INLINE void agg_id(int64_t* agg, const int64_t val) {
367  *agg = val;
368 }
369 
370 extern "C" ALWAYS_INLINE int32_t checked_single_agg_id(int64_t* agg,
371  const int64_t val,
372  const int64_t null_val) {
373  if (val == null_val) {
374  return 0;
375  }
376 
377  if (*agg == val) {
378  return 0;
379  } else if (*agg == null_val) {
380  *agg = val;
381  return 0;
382  } else {
383  // see Execute::ERR_SINGLE_VALUE_FOUND_MULTIPLE_VALUES
384  return 15;
385  }
386 }
387 
388 extern "C" ALWAYS_INLINE void agg_count_distinct_bitmap_skip_val(int64_t* agg,
389  const int64_t val,
390  const int64_t min_val,
391  const int64_t skip_val) {
392  if (val != skip_val) {
393  agg_count_distinct_bitmap(agg, val, min_val);
394  }
395 }
396 
397 extern "C" GPU_RT_STUB void agg_count_distinct_bitmap_skip_val_gpu(int64_t*,
398  const int64_t,
399  const int64_t,
400  const int64_t,
401  const int64_t,
402  const int64_t,
403  const uint64_t,
404  const uint64_t) {}
405 
406 extern "C" ALWAYS_INLINE uint32_t agg_count_int32(uint32_t* agg, const int32_t) {
407  return (*agg)++;
408 }
409 
410 extern "C" ALWAYS_INLINE int32_t agg_sum_int32(int32_t* agg, const int32_t val) {
411  const auto old = *agg;
412  *agg += val;
413  return old;
414 }
415 
416 #define DEF_AGG_MAX_INT(n) \
417  extern "C" ALWAYS_INLINE void agg_max_int##n(int##n##_t* agg, const int##n##_t val) { \
418  *agg = std::max(*agg, val); \
419  }
420 
421 DEF_AGG_MAX_INT(32)
422 DEF_AGG_MAX_INT(16)
423 DEF_AGG_MAX_INT(8)
424 #undef DEF_AGG_MAX_INT
425 
426 #define DEF_AGG_MIN_INT(n) \
427  extern "C" ALWAYS_INLINE void agg_min_int##n(int##n##_t* agg, const int##n##_t val) { \
428  *agg = std::min(*agg, val); \
429  }
430 
431 DEF_AGG_MIN_INT(32)
432 DEF_AGG_MIN_INT(16)
433 DEF_AGG_MIN_INT(8)
434 #undef DEF_AGG_MIN_INT
435 
436 #define DEF_AGG_ID_INT(n) \
437  extern "C" ALWAYS_INLINE void agg_id_int##n(int##n##_t* agg, const int##n##_t val) { \
438  *agg = val; \
439  }
440 
441 #define DEF_CHECKED_SINGLE_AGG_ID_INT(n) \
442  extern "C" ALWAYS_INLINE int32_t checked_single_agg_id_int##n( \
443  int##n##_t* agg, const int##n##_t val, const int##n##_t null_val) { \
444  if (val == null_val) { \
445  return 0; \
446  } \
447  if (*agg == val) { \
448  return 0; \
449  } else if (*agg == null_val) { \
450  *agg = val; \
451  return 0; \
452  } else { \
453  /* see Execute::ERR_SINGLE_VALUE_FOUND_MULTIPLE_VALUES*/ \
454  return 15; \
455  } \
456  }
457 
458 DEF_AGG_ID_INT(32)
459 DEF_AGG_ID_INT(16)
460 DEF_AGG_ID_INT(8)
461 
462 DEF_CHECKED_SINGLE_AGG_ID_INT(32)
463 DEF_CHECKED_SINGLE_AGG_ID_INT(16)
464 DEF_CHECKED_SINGLE_AGG_ID_INT(8)
465 
466 #undef DEF_AGG_ID_INT
467 #undef DEF_CHECKED_SINGLE_AGG_ID_INT
468 
469 #define DEF_WRITE_PROJECTION_INT(n) \
470  extern "C" ALWAYS_INLINE void write_projection_int##n( \
471  int8_t* slot_ptr, const int##n##_t val, const int64_t init_val) { \
472  if (val != init_val) { \
473  *reinterpret_cast<int##n##_t*>(slot_ptr) = val; \
474  } \
475  }
476 
477 DEF_WRITE_PROJECTION_INT(64)
478 DEF_WRITE_PROJECTION_INT(32)
479 #undef DEF_WRITE_PROJECTION_INT
480 
481 extern "C" ALWAYS_INLINE int64_t agg_sum_skip_val(int64_t* agg,
482  const int64_t val,
483  const int64_t skip_val) {
484  const auto old = *agg;
485  if (val != skip_val) {
486  if (old != skip_val) {
487  return agg_sum(agg, val);
488  } else {
489  *agg = val;
490  }
491  }
492  return old;
493 }
494 
495 extern "C" ALWAYS_INLINE int32_t agg_sum_int32_skip_val(int32_t* agg,
496  const int32_t val,
497  const int32_t skip_val) {
498  const auto old = *agg;
499  if (val != skip_val) {
500  if (old != skip_val) {
501  return agg_sum_int32(agg, val);
502  } else {
503  *agg = val;
504  }
505  }
506  return old;
507 }
508 
509 extern "C" ALWAYS_INLINE uint64_t agg_count_skip_val(uint64_t* agg,
510  const int64_t val,
511  const int64_t skip_val) {
512  if (val != skip_val) {
513  return agg_count(agg, val);
514  }
515  return *agg;
516 }
517 
518 extern "C" ALWAYS_INLINE uint32_t agg_count_int32_skip_val(uint32_t* agg,
519  const int32_t val,
520  const int32_t skip_val) {
521  if (val != skip_val) {
522  return agg_count_int32(agg, val);
523  }
524  return *agg;
525 }
526 
527 #define DEF_SKIP_AGG_ADD(base_agg_func) \
528  extern "C" ALWAYS_INLINE void base_agg_func##_skip_val( \
529  DATA_T* agg, const DATA_T val, const DATA_T skip_val) { \
530  if (val != skip_val) { \
531  base_agg_func(agg, val); \
532  } \
533  }
534 
535 #define DEF_SKIP_AGG(base_agg_func) \
536  extern "C" ALWAYS_INLINE void base_agg_func##_skip_val( \
537  DATA_T* agg, const DATA_T val, const DATA_T skip_val) { \
538  if (val != skip_val) { \
539  const DATA_T old_agg = *agg; \
540  if (old_agg != skip_val) { \
541  base_agg_func(agg, val); \
542  } else { \
543  *agg = val; \
544  } \
545  } \
546  }
547 
548 #define DATA_T int64_t
549 DEF_SKIP_AGG(agg_max)
550 DEF_SKIP_AGG(agg_min)
551 #undef DATA_T
552 
553 #define DATA_T int32_t
554 DEF_SKIP_AGG(agg_max_int32)
555 DEF_SKIP_AGG(agg_min_int32)
556 #undef DATA_T
557 
558 #define DATA_T int16_t
559 DEF_SKIP_AGG(agg_max_int16)
560 DEF_SKIP_AGG(agg_min_int16)
561 #undef DATA_T
562 
563 #define DATA_T int8_t
564 DEF_SKIP_AGG(agg_max_int8)
565 DEF_SKIP_AGG(agg_min_int8)
566 #undef DATA_T
567 
568 #undef DEF_SKIP_AGG_ADD
569 #undef DEF_SKIP_AGG
570 
571 // TODO(alex): fix signature
572 
573 extern "C" ALWAYS_INLINE uint64_t agg_count_double(uint64_t* agg, const double val) {
574  return (*agg)++;
575 }
576 
577 extern "C" ALWAYS_INLINE void agg_sum_double(int64_t* agg, const double val) {
578  const auto r = *reinterpret_cast<const double*>(agg) + val;
579  *agg = *reinterpret_cast<const int64_t*>(may_alias_ptr(&r));
580 }
581 
582 extern "C" ALWAYS_INLINE void agg_max_double(int64_t* agg, const double val) {
583  const auto r = std::max(*reinterpret_cast<const double*>(agg), val);
584  *agg = *(reinterpret_cast<const int64_t*>(may_alias_ptr(&r)));
585 }
586 
587 extern "C" ALWAYS_INLINE void agg_min_double(int64_t* agg, const double val) {
588  const auto r = std::min(*reinterpret_cast<const double*>(agg), val);
589  *agg = *(reinterpret_cast<const int64_t*>(may_alias_ptr(&r)));
590 }
591 
592 extern "C" ALWAYS_INLINE void agg_id_double(int64_t* agg, const double val) {
593  *agg = *(reinterpret_cast<const int64_t*>(may_alias_ptr(&val)));
594 }
595 
596 extern "C" ALWAYS_INLINE int32_t checked_single_agg_id_double(int64_t* agg,
597  const double val,
598  const double null_val) {
599  if (val == null_val) {
600  return 0;
601  }
602 
603  if (*agg == *(reinterpret_cast<const int64_t*>(may_alias_ptr(&val)))) {
604  return 0;
605  } else if (*agg == *(reinterpret_cast<const int64_t*>(may_alias_ptr(&null_val)))) {
606  *agg = *(reinterpret_cast<const int64_t*>(may_alias_ptr(&val)));
607  return 0;
608  } else {
609  // see Execute::ERR_SINGLE_VALUE_FOUND_MULTIPLE_VALUES
610  return 15;
611  }
612 }
613 
614 extern "C" ALWAYS_INLINE uint32_t agg_count_float(uint32_t* agg, const float val) {
615  return (*agg)++;
616 }
617 
618 extern "C" ALWAYS_INLINE void agg_sum_float(int32_t* agg, const float val) {
619  const auto r = *reinterpret_cast<const float*>(agg) + val;
620  *agg = *reinterpret_cast<const int32_t*>(may_alias_ptr(&r));
621 }
622 
623 extern "C" ALWAYS_INLINE void agg_max_float(int32_t* agg, const float val) {
624  const auto r = std::max(*reinterpret_cast<const float*>(agg), val);
625  *agg = *(reinterpret_cast<const int32_t*>(may_alias_ptr(&r)));
626 }
627 
628 extern "C" ALWAYS_INLINE void agg_min_float(int32_t* agg, const float val) {
629  const auto r = std::min(*reinterpret_cast<const float*>(agg), val);
630  *agg = *(reinterpret_cast<const int32_t*>(may_alias_ptr(&r)));
631 }
632 
633 extern "C" ALWAYS_INLINE void agg_id_float(int32_t* agg, const float val) {
634  *agg = *(reinterpret_cast<const int32_t*>(may_alias_ptr(&val)));
635 }
636 
637 extern "C" ALWAYS_INLINE int32_t checked_single_agg_id_float(int32_t* agg,
638  const float val,
639  const float null_val) {
640  if (val == null_val) {
641  return 0;
642  }
643 
644  if (*agg == *(reinterpret_cast<const int32_t*>(may_alias_ptr(&val)))) {
645  return 0;
646  } else if (*agg == *(reinterpret_cast<const int32_t*>(may_alias_ptr(&null_val)))) {
647  *agg = *(reinterpret_cast<const int32_t*>(may_alias_ptr(&val)));
648  return 0;
649  } else {
650  // see Execute::ERR_SINGLE_VALUE_FOUND_MULTIPLE_VALUES
651  return 15;
652  }
653 }
654 
655 extern "C" ALWAYS_INLINE uint64_t agg_count_double_skip_val(uint64_t* agg,
656  const double val,
657  const double skip_val) {
658  if (val != skip_val) {
659  return agg_count_double(agg, val);
660  }
661  return *agg;
662 }
663 
664 extern "C" ALWAYS_INLINE uint32_t agg_count_float_skip_val(uint32_t* agg,
665  const float val,
666  const float skip_val) {
667  if (val != skip_val) {
668  return agg_count_float(agg, val);
669  }
670  return *agg;
671 }
672 
673 #define DEF_SKIP_AGG_ADD(base_agg_func) \
674  extern "C" ALWAYS_INLINE void base_agg_func##_skip_val( \
675  ADDR_T* agg, const DATA_T val, const DATA_T skip_val) { \
676  if (val != skip_val) { \
677  base_agg_func(agg, val); \
678  } \
679  }
680 
681 #define DEF_SKIP_AGG(base_agg_func) \
682  extern "C" ALWAYS_INLINE void base_agg_func##_skip_val( \
683  ADDR_T* agg, const DATA_T val, const DATA_T skip_val) { \
684  if (val != skip_val) { \
685  const ADDR_T old_agg = *agg; \
686  if (old_agg != *reinterpret_cast<const ADDR_T*>(may_alias_ptr(&skip_val))) { \
687  base_agg_func(agg, val); \
688  } else { \
689  *agg = *reinterpret_cast<const ADDR_T*>(may_alias_ptr(&val)); \
690  } \
691  } \
692  }
693 
694 #define DATA_T double
695 #define ADDR_T int64_t
696 DEF_SKIP_AGG(agg_sum_double)
697 DEF_SKIP_AGG(agg_max_double)
698 DEF_SKIP_AGG(agg_min_double)
699 #undef ADDR_T
700 #undef DATA_T
701 
702 #define DATA_T float
703 #define ADDR_T int32_t
704 DEF_SKIP_AGG(agg_sum_float)
705 DEF_SKIP_AGG(agg_max_float)
706 DEF_SKIP_AGG(agg_min_float)
707 #undef ADDR_T
708 #undef DATA_T
709 
710 #undef DEF_SKIP_AGG_ADD
711 #undef DEF_SKIP_AGG
712 
713 extern "C" ALWAYS_INLINE int64_t decimal_floor(const int64_t x, const int64_t scale) {
714  if (x >= 0) {
715  return x / scale * scale;
716  }
717  if (!(x % scale)) {
718  return x;
719  }
720  return x / scale * scale - scale;
721 }
722 
723 extern "C" ALWAYS_INLINE int64_t decimal_ceil(const int64_t x, const int64_t scale) {
724  return decimal_floor(x, scale) + (x % scale ? scale : 0);
725 }
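// decimal_floor/decimal_ceil round a scaled decimal to a multiple of scale,
// e.g. with scale = 100: decimal_floor(-123, 100) = -200, decimal_ceil(-123, 100) = -100.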
726 
727 // Shared memory aggregators. Should never be called,
728 // real implementations are in cuda_mapd_rt.cu.
729 #define DEF_SHARED_AGG_RET_STUBS(base_agg_func) \
730  extern "C" GPU_RT_STUB uint64_t base_agg_func##_shared(uint64_t* agg, \
731  const int64_t val) { \
732  return 0; \
733  } \
734  \
735  extern "C" GPU_RT_STUB uint64_t base_agg_func##_skip_val_shared( \
736  uint64_t* agg, const int64_t val, const int64_t skip_val) { \
737  return 0; \
738  } \
739  extern "C" GPU_RT_STUB uint32_t base_agg_func##_int32_shared(uint32_t* agg, \
740  const int32_t val) { \
741  return 0; \
742  } \
743  \
744  extern "C" GPU_RT_STUB uint32_t base_agg_func##_int32_skip_val_shared( \
745  uint32_t* agg, const int32_t val, const int32_t skip_val) { \
746  return 0; \
747  } \
748  \
749  extern "C" GPU_RT_STUB uint64_t base_agg_func##_double_shared(uint64_t* agg, \
750  const double val) { \
751  return 0; \
752  } \
753  \
754  extern "C" GPU_RT_STUB uint64_t base_agg_func##_double_skip_val_shared( \
755  uint64_t* agg, const double val, const double skip_val) { \
756  return 0; \
757  } \
758  extern "C" GPU_RT_STUB uint32_t base_agg_func##_float_shared(uint32_t* agg, \
759  const float val) { \
760  return 0; \
761  } \
762  \
763  extern "C" GPU_RT_STUB uint32_t base_agg_func##_float_skip_val_shared( \
764  uint32_t* agg, const float val, const float skip_val) { \
765  return 0; \
766  }
767 
768 #define DEF_SHARED_AGG_STUBS(base_agg_func) \
769  extern "C" GPU_RT_STUB void base_agg_func##_shared(int64_t* agg, const int64_t val) {} \
770  \
771  extern "C" GPU_RT_STUB void base_agg_func##_skip_val_shared( \
772  int64_t* agg, const int64_t val, const int64_t skip_val) {} \
773  extern "C" GPU_RT_STUB void base_agg_func##_int32_shared(int32_t* agg, \
774  const int32_t val) {} \
775  extern "C" GPU_RT_STUB void base_agg_func##_int16_shared(int16_t* agg, \
776  const int16_t val) {} \
777  extern "C" GPU_RT_STUB void base_agg_func##_int8_shared(int8_t* agg, \
778  const int8_t val) {} \
779  \
780  extern "C" GPU_RT_STUB void base_agg_func##_int32_skip_val_shared( \
781  int32_t* agg, const int32_t val, const int32_t skip_val) {} \
782  \
783  extern "C" GPU_RT_STUB void base_agg_func##_double_shared(int64_t* agg, \
784  const double val) {} \
785  \
786  extern "C" GPU_RT_STUB void base_agg_func##_double_skip_val_shared( \
787  int64_t* agg, const double val, const double skip_val) {} \
788  extern "C" GPU_RT_STUB void base_agg_func##_float_shared(int32_t* agg, \
789  const float val) {} \
790  \
791  extern "C" GPU_RT_STUB void base_agg_func##_float_skip_val_shared( \
792  int32_t* agg, const float val, const float skip_val) {}
793 
793 
794 DEF_SHARED_AGG_RET_STUBS(agg_count)
795 DEF_SHARED_AGG_STUBS(agg_max)
796 DEF_SHARED_AGG_STUBS(agg_min)
797 DEF_SHARED_AGG_STUBS(agg_id)
798 
799 extern "C" GPU_RT_STUB int32_t checked_single_agg_id_shared(int64_t* agg,
800  const int64_t val,
801  const int64_t null_val) {
802  return 0;
803 }
804 
805 extern "C" GPU_RT_STUB int32_t
806 checked_single_agg_id_int32_shared(int32_t* agg,
807  const int32_t val,
808  const int32_t null_val) {
809  return 0;
810 }
811 extern "C" GPU_RT_STUB int32_t
812 checked_single_agg_id_int16_shared(int16_t* agg,
813  const int16_t val,
814  const int16_t null_val) {
815  return 0;
816 }
817 extern "C" GPU_RT_STUB int32_t checked_single_agg_id_int8_shared(int8_t* agg,
818  const int8_t val,
819  const int8_t null_val) {
820  return 0;
821 }
822 
823 extern "C" GPU_RT_STUB int32_t
824 checked_single_agg_id_double_shared(int64_t* agg,
825  const double val,
826  const double null_val) {
827  return 0;
828 }
829 
830 extern "C" GPU_RT_STUB int32_t checked_single_agg_id_float_shared(int32_t* agg,
831  const float val,
832  const float null_val) {
833  return 0;
834 }
835 
836 extern "C" GPU_RT_STUB void agg_max_int16_skip_val_shared(int16_t* agg,
837  const int16_t val,
838  const int16_t skip_val) {}
839 
840 extern "C" GPU_RT_STUB void agg_max_int8_skip_val_shared(int8_t* agg,
841  const int8_t val,
842  const int8_t skip_val) {}
843 
844 extern "C" GPU_RT_STUB void agg_min_int16_skip_val_shared(int16_t* agg,
845  const int16_t val,
846  const int16_t skip_val) {}
847 
848 extern "C" GPU_RT_STUB void agg_min_int8_skip_val_shared(int8_t* agg,
849  const int8_t val,
850  const int8_t skip_val) {}
851 
852 extern "C" GPU_RT_STUB void agg_id_double_shared_slow(int64_t* agg, const double* val) {}
853 
854 extern "C" GPU_RT_STUB int64_t agg_sum_shared(int64_t* agg, const int64_t val) {
855  return 0;
856 }
857 
858 extern "C" GPU_RT_STUB int64_t agg_sum_skip_val_shared(int64_t* agg,
859  const int64_t val,
860  const int64_t skip_val) {
861  return 0;
862 }
863 extern "C" GPU_RT_STUB int32_t agg_sum_int32_shared(int32_t* agg, const int32_t val) {
864  return 0;
865 }
866 
867 extern "C" GPU_RT_STUB int32_t agg_sum_int32_skip_val_shared(int32_t* agg,
868  const int32_t val,
869  const int32_t skip_val) {
870  return 0;
871 }
872 
873 extern "C" GPU_RT_STUB void agg_sum_double_shared(int64_t* agg, const double val) {}
874 
875 extern "C" GPU_RT_STUB void agg_sum_double_skip_val_shared(int64_t* agg,
876  const double val,
877  const double skip_val) {}
878 extern "C" GPU_RT_STUB void agg_sum_float_shared(int32_t* agg, const float val) {}
879 
880 extern "C" GPU_RT_STUB void agg_sum_float_skip_val_shared(int32_t* agg,
881  const float val,
882  const float skip_val) {}
883 
884 extern "C" GPU_RT_STUB void force_sync() {}
885 
886 extern "C" GPU_RT_STUB void sync_warp() {}
887 extern "C" GPU_RT_STUB void sync_warp_protected(int64_t thread_pos, int64_t row_count) {}
888 extern "C" GPU_RT_STUB void sync_threadblock() {}
889 
890 extern "C" GPU_RT_STUB void write_back_non_grouped_agg(int64_t* input_buffer,
891  int64_t* output_buffer,
892  const int32_t num_agg_cols) {}
893 // x64 stride functions
894 
895 extern "C" __attribute__((noinline)) int32_t pos_start_impl(int32_t* error_code) {
896  int32_t row_index_resume{0};
897  if (error_code) {
898  row_index_resume = error_code[0];
899  error_code[0] = 0;
900  }
901  return row_index_resume;
902 }
903 
904 extern "C" __attribute__((noinline)) int32_t group_buff_idx_impl() {
905  return pos_start_impl(nullptr);
906 }
907 
908 extern "C" __attribute__((noinline)) int32_t pos_step_impl() {
909  return 1;
910 }
911 
912 extern "C" GPU_RT_STUB int8_t thread_warp_idx(const int8_t warp_sz) {
913  return 0;
914 }
915 
916 extern "C" GPU_RT_STUB int64_t get_thread_index() {
917  return 0;
918 }
919 
920 extern "C" GPU_RT_STUB int64_t* declare_dynamic_shared_memory() {
921  return nullptr;
922 }
923 
924 extern "C" GPU_RT_STUB int64_t get_block_index() {
925  return 0;
926 }
927 
928 #undef GPU_RT_STUB
929 
930 extern "C" ALWAYS_INLINE int32_t record_error_code(const int32_t err_code,
931  int32_t* error_codes) {
932  // NB: never override persistent error codes (with code greater than zero).
933  // On GPU, a projection query with a limit can run out of slots without it
934  // being an actual error if the limit has been hit. If a persistent error
935  // (division by zero, for example) occurs before running out of slots, we
936  // have to avoid overriding it, because there's a risk that the query would
937  // go through if we override with a potentially benign out-of-slots code.
938  if (err_code && error_codes[pos_start_impl(nullptr)] <= 0) {
939  error_codes[pos_start_impl(nullptr)] = err_code;
940  }
941  return err_code;
942 }
943 
944 // group by helpers
945 
946 extern "C" __attribute__((noinline)) const int64_t* init_shared_mem_nop(
947  const int64_t* groups_buffer,
948  const int32_t groups_buffer_size) {
949  return groups_buffer;
950 }
951 
952 extern "C" __attribute__((noinline)) void write_back_nop(int64_t* dest,
953  int64_t* src,
954  const int32_t sz) {
955  // the body is not really needed, just make sure the call is not optimized away
956  assert(dest);
957 }
958 
959 extern "C" int64_t* init_shared_mem(const int64_t* global_groups_buffer,
960  const int32_t groups_buffer_size) {
961  return nullptr;
962 }
963 
964 extern "C" __attribute__((noinline)) void init_group_by_buffer_gpu(
965  int64_t* groups_buffer,
966  const int64_t* init_vals,
967  const uint32_t groups_buffer_entry_count,
968  const uint32_t key_qw_count,
969  const uint32_t agg_col_count,
970  const bool keyless,
971  const int8_t warp_size) {
972  // the body is not really needed, just make sure the call is not optimized away
973  assert(groups_buffer);
974 }
975 
976 extern "C" __attribute__((noinline)) void init_columnar_group_by_buffer_gpu(
977  int64_t* groups_buffer,
978  const int64_t* init_vals,
979  const uint32_t groups_buffer_entry_count,
980  const uint32_t key_qw_count,
981  const uint32_t agg_col_count,
982  const bool keyless,
983  const bool blocks_share_memory,
984  const int32_t frag_idx) {
985  // the body is not really needed, just make sure the call is not optimized away
986  assert(groups_buffer);
987 }
988 
989 extern "C" __attribute__((noinline)) void init_group_by_buffer_impl(
990  int64_t* groups_buffer,
991  const int64_t* init_vals,
992  const uint32_t groups_buffer_entry_count,
993  const uint32_t key_qw_count,
994  const uint32_t agg_col_count,
995  const bool keyless,
996  const int8_t warp_size) {
997  // the body is not really needed, just make sure the call is not optimized away
998  assert(groups_buffer);
999 }
1000 
1001 template <typename T>
1002 ALWAYS_INLINE int64_t* get_matching_group_value(int64_t* groups_buffer,
1003  const uint32_t h,
1004  const T* key,
1005  const uint32_t key_count,
1006  const uint32_t row_size_quad) {
1007  auto off = h * row_size_quad;
1008  auto row_ptr = reinterpret_cast<T*>(groups_buffer + off);
1009  if (*row_ptr == get_empty_key<T>()) {
1010  memcpy(row_ptr, key, key_count * sizeof(T));
1011  auto row_ptr_i8 = reinterpret_cast<int8_t*>(row_ptr + key_count);
1012  return reinterpret_cast<int64_t*>(align_to_int64(row_ptr_i8));
1013  }
1014  if (memcmp(row_ptr, key, key_count * sizeof(T)) == 0) {
1015  auto row_ptr_i8 = reinterpret_cast<int8_t*>(row_ptr + key_count);
1016  return reinterpret_cast<int64_t*>(align_to_int64(row_ptr_i8));
1017  }
1018  return nullptr;
1019 }
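// Row-wise layout probed above: each entry spans row_size_quad 64-bit slots, with the
// key at the front and the aggregate columns starting at the next 64-bit boundary.
// Returns a pointer to the aggregates for an empty or matching slot, nullptr on a
// collision with a different key.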
1020 
1021 extern "C" ALWAYS_INLINE int64_t* get_matching_group_value(int64_t* groups_buffer,
1022  const uint32_t h,
1023  const int64_t* key,
1024  const uint32_t key_count,
1025  const uint32_t key_width,
1026  const uint32_t row_size_quad,
1027  const int64_t* init_vals) {
1028  switch (key_width) {
1029  case 4:
1030  return get_matching_group_value(groups_buffer,
1031  h,
1032  reinterpret_cast<const int32_t*>(key),
1033  key_count,
1034  row_size_quad);
1035  case 8:
1036  return get_matching_group_value(groups_buffer, h, key, key_count, row_size_quad);
1037  default:;
1038  }
1039  return nullptr;
1040 }
1041 
1042 template <typename T>
1043 ALWAYS_INLINE int32_t get_matching_group_value_columnar_slot(int64_t* groups_buffer,
1044  const uint32_t entry_count,
1045  const uint32_t h,
1046  const T* key,
1047  const uint32_t key_count) {
1048  auto off = h;
1049  auto key_buffer = reinterpret_cast<T*>(groups_buffer);
1050  if (key_buffer[off] == get_empty_key<T>()) {
1051  for (size_t i = 0; i < key_count; ++i) {
1052  key_buffer[off] = key[i];
1053  off += entry_count;
1054  }
1055  return h;
1056  }
1057  off = h;
1058  for (size_t i = 0; i < key_count; ++i) {
1059  if (key_buffer[off] != key[i]) {
1060  return -1;
1061  }
1062  off += entry_count;
1063  }
1064  return h;
1065 }
1066 
1067 extern "C" ALWAYS_INLINE int32_t
1068 get_matching_group_value_columnar_slot(int64_t* groups_buffer,
1069  const uint32_t entry_count,
1070  const uint32_t h,
1071  const int64_t* key,
1072  const uint32_t key_count,
1073  const uint32_t key_width) {
1074  switch (key_width) {
1075  case 4:
1076  return get_matching_group_value_columnar_slot(groups_buffer,
1077  entry_count,
1078  h,
1079  reinterpret_cast<const int32_t*>(key),
1080  key_count);
1081  case 8:
1082  return get_matching_group_value_columnar_slot(
1083  groups_buffer, entry_count, h, key, key_count);
1084  default:
1085  return -1;
1086  }
1087  return -1;
1088 }
1089 
1090 extern "C" ALWAYS_INLINE int64_t* get_matching_group_value_columnar(
1091  int64_t* groups_buffer,
1092  const uint32_t h,
1093  const int64_t* key,
1094  const uint32_t key_qw_count,
1095  const size_t entry_count) {
1096  auto off = h;
1097  if (groups_buffer[off] == EMPTY_KEY_64) {
1098  for (size_t i = 0; i < key_qw_count; ++i) {
1099  groups_buffer[off] = key[i];
1100  off += entry_count;
1101  }
1102  return &groups_buffer[off];
1103  }
1104  off = h;
1105  for (size_t i = 0; i < key_qw_count; ++i) {
1106  if (groups_buffer[off] != key[i]) {
1107  return nullptr;
1108  }
1109  off += entry_count;
1110  }
1111  return &groups_buffer[off];
1112 }
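// Columnar key layout: component i of an entry's key lives at
// groups_buffer[h + i * entry_count], so successive key components sit
// entry_count slots apart rather than adjacent in memory.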
1113 
1114 /*
1115  * For a particular hashed_index, returns the row-wise offset
1116  * to the first matching agg column in memory.
1117  * It also checks the corresponding group column, and initialize all
1118  * available keys if they are not empty (it is assumed all group columns are
1119  * 64-bit wide).
1120  *
1121  * Memory layout:
1122  *
1123  * | prepended group columns (64-bit each) | agg columns |
1124  */
1125 extern "C" ALWAYS_INLINE int64_t* get_matching_group_value_perfect_hash(
1126  int64_t* groups_buffer,
1127  const uint32_t hashed_index,
1128  const int64_t* key,
1129  const uint32_t key_count,
1130  const uint32_t row_size_quad) {
1131  uint32_t off = hashed_index * row_size_quad;
1132  if (groups_buffer[off] == EMPTY_KEY_64) {
1133  for (uint32_t i = 0; i < key_count; ++i) {
1134  groups_buffer[off + i] = key[i];
1135  }
1136  }
1137  return groups_buffer + off + key_count;
1138 }
1139 
1140 /*
1141  * For a particular hashed_index, returns the row-wise offset of the entry in the
1142  * output buffer. This is the keyless perfect-hash path, so no group key columns
1143  * are prepended and the offset points directly at the entry's aggregate columns.
1144  */
1145 
1146 extern "C" ALWAYS_INLINE int64_t* get_matching_group_value_perfect_hash_keyless(
1147  int64_t* groups_buffer,
1148  const uint32_t hashed_index,
1149  const uint32_t row_size_quad) {
1150  return groups_buffer + row_size_quad * hashed_index;
1151 }
1152 
1153 /*
1154  * For a particular hashed_index, find and initialize (if necessary) all the group
1155  * columns corresponding to a key. It is assumed that all group columns are 64-bit wide.
1156  */
1157 extern "C" ALWAYS_INLINE void set_matching_group_value_perfect_hash_columnar(
1158  int64_t* groups_buffer,
1159  const uint32_t hashed_index,
1160  const int64_t* key,
1161  const uint32_t key_count,
1162  const uint32_t entry_count) {
1163  if (groups_buffer[hashed_index] == EMPTY_KEY_64) {
1164  for (uint32_t i = 0; i < key_count; i++) {
1165  groups_buffer[i * entry_count + hashed_index] = key[i];
1166  }
1167  }
1168 }
1169 
1170 #include "GroupByRuntime.cpp"
1172 
1173 extern "C" ALWAYS_INLINE int64_t* get_group_value_fast_keyless(
1174  int64_t* groups_buffer,
1175  const int64_t key,
1176  const int64_t min_key,
1177  const int64_t /* bucket */,
1178  const uint32_t row_size_quad) {
1179  return groups_buffer + row_size_quad * (key - min_key);
1180 }
1181 
1182 extern "C" ALWAYS_INLINE int64_t* get_group_value_fast_keyless_semiprivate(
1183  int64_t* groups_buffer,
1184  const int64_t key,
1185  const int64_t min_key,
1186  const int64_t /* bucket */,
1187  const uint32_t row_size_quad,
1188  const uint8_t thread_warp_idx,
1189  const uint8_t warp_size) {
1190  return groups_buffer + row_size_quad * (warp_size * (key - min_key) + thread_warp_idx);
1191 }
1192 
1193 extern "C" ALWAYS_INLINE int8_t* extract_str_ptr(const uint64_t str_and_len) {
1194  return reinterpret_cast<int8_t*>(str_and_len & 0xffffffffffff);
1195 }
1196 
1197 extern "C" ALWAYS_INLINE int32_t extract_str_len(const uint64_t str_and_len) {
1198  return static_cast<int64_t>(str_and_len) >> 48;
1199 }
1200 
1201 extern "C" __attribute__((noinline)) int8_t* extract_str_ptr_noinline(
1202  const uint64_t str_and_len) {
1203  return extract_str_ptr(str_and_len);
1204 }
1205 
1206 extern "C" __attribute__((noinline)) int32_t extract_str_len_noinline(
1207  const uint64_t str_and_len) {
1208  return extract_str_len(str_and_len);
1209 }
1210 
1211 extern "C" ALWAYS_INLINE uint64_t string_pack(const int8_t* ptr, const int32_t len) {
1212  return (reinterpret_cast<const uint64_t>(ptr) & 0xffffffffffff) |
1213  (static_cast<const uint64_t>(len) << 48);
1214 }
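// Strings are packed into a single 64-bit value: the low 48 bits carry the pointer
// and the high 16 bits the length, mirroring extract_str_ptr / extract_str_len above.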
1215 
1216 #ifdef __clang__
1217 #include "../Utils/StringLike.cpp"
1218 #endif
1219 
1220 #ifndef __CUDACC__
1221 #include "TopKRuntime.cpp"
1222 #endif
1223 
1224 extern "C" ALWAYS_INLINE DEVICE int32_t char_length(const char* str,
1225  const int32_t str_len) {
1226  return str_len;
1227 }
1228 
1229 extern "C" ALWAYS_INLINE DEVICE int32_t char_length_nullable(const char* str,
1230  const int32_t str_len,
1231  const int32_t int_null) {
1232  if (!str) {
1233  return int_null;
1234  }
1235  return str_len;
1236 }
1237 
1238 extern "C" ALWAYS_INLINE DEVICE int32_t key_for_string_encoded(const int32_t str_id) {
1239  return str_id;
1240 }
1241 
1242 extern "C" ALWAYS_INLINE DEVICE bool sample_ratio(const double proportion,
1243  const int64_t row_offset) {
1244  const int64_t threshold = 4294967296 * proportion;
1245  return (row_offset * 2654435761) % 4294967296 < threshold;
1246 }
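// Multiplicative hash of the row offset (2654435761 is Knuth's 32-bit multiplicative
// hashing constant), compared against proportion * 2^32, so roughly `proportion` of
// rows pass the sample predicate.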
1247 
1248 extern "C" ALWAYS_INLINE int64_t row_number_window_func(const int64_t output_buff,
1249  const int64_t pos) {
1250  return reinterpret_cast<const int64_t*>(output_buff)[pos];
1251 }
1252 
1253 extern "C" ALWAYS_INLINE double percent_window_func(const int64_t output_buff,
1254  const int64_t pos) {
1255  return reinterpret_cast<const double*>(output_buff)[pos];
1256 }
1257 
1258 extern "C" ALWAYS_INLINE double load_double(const int64_t* agg) {
1259  return *reinterpret_cast<const double*>(may_alias_ptr(agg));
1260 }
1261 
1262 extern "C" ALWAYS_INLINE float load_float(const int32_t* agg) {
1263  return *reinterpret_cast<const float*>(may_alias_ptr(agg));
1264 }
1265 
1266 extern "C" ALWAYS_INLINE double load_avg_int(const int64_t* sum,
1267  const int64_t* count,
1268  const double null_val) {
1269  return *count != 0 ? static_cast<double>(*sum) / *count : null_val;
1270 }
1271 
1272 extern "C" ALWAYS_INLINE double load_avg_decimal(const int64_t* sum,
1273  const int64_t* count,
1274  const double null_val,
1275  const uint32_t scale) {
1276  return *count != 0 ? (static_cast<double>(*sum) / pow(10, scale)) / *count : null_val;
1277 }
1278 
1279 extern "C" ALWAYS_INLINE double load_avg_double(const int64_t* agg,
1280  const int64_t* count,
1281  const double null_val) {
1282  return *count != 0 ? *reinterpret_cast<const double*>(may_alias_ptr(agg)) / *count
1283  : null_val;
1284 }
1285 
1286 extern "C" ALWAYS_INLINE double load_avg_float(const int32_t* agg,
1287  const int32_t* count,
1288  const double null_val) {
1289  return *count != 0 ? *reinterpret_cast<const float*>(may_alias_ptr(agg)) / *count
1290  : null_val;
1291 }
1292 
1293 extern "C" NEVER_INLINE void linear_probabilistic_count(uint8_t* bitmap,
1294  const uint32_t bitmap_bytes,
1295  const uint8_t* key_bytes,
1296  const uint32_t key_len) {
1297  const uint32_t bit_pos = MurmurHash1(key_bytes, key_len, 0) % (bitmap_bytes * 8);
1298  const uint32_t word_idx = bit_pos / 32;
1299  const uint32_t bit_idx = bit_pos % 32;
1300  reinterpret_cast<uint32_t*>(bitmap)[word_idx] |= 1 << bit_idx;
1301 }
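// Linear (probabilistic) counting: each key hashes to a single bit position in the
// bitmap; the cardinality estimate is derived elsewhere from the fraction of bits
// that remain zero.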
1302 
1303 extern "C" __attribute__((noinline)) void query_stub_hoisted_literals(
1304  const int8_t** col_buffers,
1305  const int8_t* literals,
1306  const int64_t* num_rows,
1307  const uint64_t* frag_row_offsets,
1308  const int32_t* max_matched,
1309  const int64_t* init_agg_value,
1310  int64_t** out,
1311  uint32_t frag_idx,
1312  const int64_t* join_hash_tables,
1313  int32_t* error_code,
1314  int32_t* total_matched) {
1315  assert(col_buffers || literals || num_rows || frag_row_offsets || max_matched ||
1316  init_agg_value || out || frag_idx || error_code || join_hash_tables ||
1317  total_matched);
1318 }
1319 
1320 extern "C" void multifrag_query_hoisted_literals(const int8_t*** col_buffers,
1321  const uint64_t* num_fragments,
1322  const int8_t* literals,
1323  const int64_t* num_rows,
1324  const uint64_t* frag_row_offsets,
1325  const int32_t* max_matched,
1326  int32_t* total_matched,
1327  const int64_t* init_agg_value,
1328  int64_t** out,
1329  int32_t* error_code,
1330  const uint32_t* num_tables_ptr,
1331  const int64_t* join_hash_tables) {
1332  for (uint32_t i = 0; i < *num_fragments; ++i) {
1333  query_stub_hoisted_literals(col_buffers ? col_buffers[i] : nullptr,
1334  literals,
1335  &num_rows[i * (*num_tables_ptr)],
1336  &frag_row_offsets[i * (*num_tables_ptr)],
1337  max_matched,
1338  init_agg_value,
1339  out,
1340  i,
1341  join_hash_tables,
1342  total_matched,
1343  error_code);
1344  }
1345 }
1346 
1347 extern "C" __attribute__((noinline)) void query_stub(const int8_t** col_buffers,
1348  const int64_t* num_rows,
1349  const uint64_t* frag_row_offsets,
1350  const int32_t* max_matched,
1351  const int64_t* init_agg_value,
1352  int64_t** out,
1353  uint32_t frag_idx,
1354  const int64_t* join_hash_tables,
1355  int32_t* error_code,
1356  int32_t* total_matched) {
1357  assert(col_buffers || num_rows || frag_row_offsets || max_matched || init_agg_value ||
1358  out || frag_idx || error_code || join_hash_tables || total_matched);
1359 }
1360 
1361 extern "C" void multifrag_query(const int8_t*** col_buffers,
1362  const uint64_t* num_fragments,
1363  const int64_t* num_rows,
1364  const uint64_t* frag_row_offsets,
1365  const int32_t* max_matched,
1366  int32_t* total_matched,
1367  const int64_t* init_agg_value,
1368  int64_t** out,
1369  int32_t* error_code,
1370  const uint32_t* num_tables_ptr,
1371  const int64_t* join_hash_tables) {
1372  for (uint32_t i = 0; i < *num_fragments; ++i) {
1373  query_stub(col_buffers ? col_buffers[i] : nullptr,
1374  &num_rows[i * (*num_tables_ptr)],
1375  &frag_row_offsets[i * (*num_tables_ptr)],
1376  max_matched,
1377  init_agg_value,
1378  out,
1379  i,
1380  join_hash_tables,
1381  total_matched,
1382  error_code);
1383  }
1384 }
1385 
1386 extern "C" bool check_interrupt() {
1387  if (check_interrupt_init(static_cast<unsigned>(INT_CHECK))) {
1388  return true;
1389  }
1390  return false;
1391 }
1392 
1393 extern "C" bool check_interrupt_init(unsigned command) {
1394  static std::atomic_bool runtime_interrupt_flag{false};
1395 
1396  if (command == static_cast<unsigned>(INT_CHECK)) {
1397  if (runtime_interrupt_flag.load()) {
1398  return true;
1399  }
1400  return false;
1401  }
1402  if (command == static_cast<unsigned>(INT_ABORT)) {
1403  runtime_interrupt_flag.store(true);
1404  return false;
1405  }
1406  if (command == static_cast<unsigned>(INT_RESET)) {
1407  runtime_interrupt_flag.store(false);
1408  return false;
1409  }
1410  return false;
1411 }