RuntimeFunctions.cpp
1 /*
2  * Copyright 2017 MapD Technologies, Inc.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #ifdef __CUDACC__
18 #error This code is not intended to be compiled with a CUDA C++ compiler
19 #endif // __CUDACC__
20 
21 #include "RuntimeFunctions.h"
22 #include "../Shared/funcannotations.h"
23 #include "BufferCompaction.h"
24 #include "HyperLogLogRank.h"
25 #include "MurmurHash.h"
26 #include "TypePunning.h"
27 
28 #include <algorithm>
29 #include <chrono>
30 #include <cmath>
31 #include <cstring>
32 #include <thread>
33 #include <tuple>
34 
35 // decoder implementations
36 
37 #include "DecodersImpl.h"
38 
39 // arithmetic operator implementations
40 
41 #define DEF_ARITH_NULLABLE(type, null_type, opname, opsym) \
42  extern "C" ALWAYS_INLINE type opname##_##type##_nullable( \
43  const type lhs, const type rhs, const null_type null_val) { \
44  if (lhs != null_val && rhs != null_val) { \
45  return lhs opsym rhs; \
46  } \
47  return null_val; \
48  }
49 
50 #define DEF_ARITH_NULLABLE_LHS(type, null_type, opname, opsym) \
51  extern "C" ALWAYS_INLINE type opname##_##type##_nullable_lhs( \
52  const type lhs, const type rhs, const null_type null_val) { \
53  if (lhs != null_val) { \
54  return lhs opsym rhs; \
55  } \
56  return null_val; \
57  }
58 
59 #define DEF_ARITH_NULLABLE_RHS(type, null_type, opname, opsym) \
60  extern "C" ALWAYS_INLINE type opname##_##type##_nullable_rhs( \
61  const type lhs, const type rhs, const null_type null_val) { \
62  if (rhs != null_val) { \
63  return lhs opsym rhs; \
64  } \
65  return null_val; \
66  }
67 
68 #define DEF_CMP_NULLABLE(type, null_type, opname, opsym) \
69  extern "C" ALWAYS_INLINE int8_t opname##_##type##_nullable( \
70  const type lhs, \
71  const type rhs, \
72  const null_type null_val, \
73  const int8_t null_bool_val) { \
74  if (lhs != null_val && rhs != null_val) { \
75  return lhs opsym rhs; \
76  } \
77  return null_bool_val; \
78  }
79 
80 #define DEF_CMP_NULLABLE_LHS(type, null_type, opname, opsym) \
81  extern "C" ALWAYS_INLINE int8_t opname##_##type##_nullable_lhs( \
82  const type lhs, \
83  const type rhs, \
84  const null_type null_val, \
85  const int8_t null_bool_val) { \
86  if (lhs != null_val) { \
87  return lhs opsym rhs; \
88  } \
89  return null_bool_val; \
90  }
91 
92 #define DEF_CMP_NULLABLE_RHS(type, null_type, opname, opsym) \
93  extern "C" ALWAYS_INLINE int8_t opname##_##type##_nullable_rhs( \
94  const type lhs, \
95  const type rhs, \
96  const null_type null_val, \
97  const int8_t null_bool_val) { \
98  if (rhs != null_val) { \
99  return lhs opsym rhs; \
100  } \
101  return null_bool_val; \
102  }
103 
104 #define DEF_SAFE_DIV_NULLABLE(type, null_type, opname) \
105  extern "C" ALWAYS_INLINE type safe_div_##type( \
106  const type lhs, const type rhs, const null_type null_val) { \
107  if (lhs != null_val && rhs != null_val && rhs != 0) { \
108  return lhs / rhs; \
109  } \
110  return null_val; \
111  }
112 
113 #define DEF_BINARY_NULLABLE_ALL_OPS(type, null_type) \
114  DEF_ARITH_NULLABLE(type, null_type, add, +) \
115  DEF_ARITH_NULLABLE(type, null_type, sub, -) \
116  DEF_ARITH_NULLABLE(type, null_type, mul, *) \
117  DEF_ARITH_NULLABLE(type, null_type, div, /) \
118  DEF_SAFE_DIV_NULLABLE(type, null_type, safe_div) \
119  DEF_ARITH_NULLABLE_LHS(type, null_type, add, +) \
120  DEF_ARITH_NULLABLE_LHS(type, null_type, sub, -) \
121  DEF_ARITH_NULLABLE_LHS(type, null_type, mul, *) \
122  DEF_ARITH_NULLABLE_LHS(type, null_type, div, /) \
123  DEF_ARITH_NULLABLE_RHS(type, null_type, add, +) \
124  DEF_ARITH_NULLABLE_RHS(type, null_type, sub, -) \
125  DEF_ARITH_NULLABLE_RHS(type, null_type, mul, *) \
126  DEF_ARITH_NULLABLE_RHS(type, null_type, div, /) \
127  DEF_CMP_NULLABLE(type, null_type, eq, ==) \
128  DEF_CMP_NULLABLE(type, null_type, ne, !=) \
129  DEF_CMP_NULLABLE(type, null_type, lt, <) \
130  DEF_CMP_NULLABLE(type, null_type, gt, >) \
131  DEF_CMP_NULLABLE(type, null_type, le, <=) \
132  DEF_CMP_NULLABLE(type, null_type, ge, >=) \
133  DEF_CMP_NULLABLE_LHS(type, null_type, eq, ==) \
134  DEF_CMP_NULLABLE_LHS(type, null_type, ne, !=) \
135  DEF_CMP_NULLABLE_LHS(type, null_type, lt, <) \
136  DEF_CMP_NULLABLE_LHS(type, null_type, gt, >) \
137  DEF_CMP_NULLABLE_LHS(type, null_type, le, <=) \
138  DEF_CMP_NULLABLE_LHS(type, null_type, ge, >=) \
139  DEF_CMP_NULLABLE_RHS(type, null_type, eq, ==) \
140  DEF_CMP_NULLABLE_RHS(type, null_type, ne, !=) \
141  DEF_CMP_NULLABLE_RHS(type, null_type, lt, <) \
142  DEF_CMP_NULLABLE_RHS(type, null_type, gt, >) \
143  DEF_CMP_NULLABLE_RHS(type, null_type, le, <=) \
144  DEF_CMP_NULLABLE_RHS(type, null_type, ge, >=)
145 
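// As an illustration, DEF_ARITH_NULLABLE(int32_t, int64_t, add, +) from the block
// above expands to (modulo formatting):
//
//   extern "C" ALWAYS_INLINE int32_t add_int32_t_nullable(const int32_t lhs,
//                                                         const int32_t rhs,
//                                                         const int64_t null_val) {
//     if (lhs != null_val && rhs != null_val) {
//       return lhs + rhs;
//     }
//     return null_val;
//   }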
146 DEF_BINARY_NULLABLE_ALL_OPS(int8_t, int64_t)
147 DEF_BINARY_NULLABLE_ALL_OPS(int16_t, int64_t)
148 DEF_BINARY_NULLABLE_ALL_OPS(int32_t, int64_t)
149 DEF_BINARY_NULLABLE_ALL_OPS(int64_t, int64_t)
150 DEF_BINARY_NULLABLE_ALL_OPS(float, float)
151 DEF_BINARY_NULLABLE_ALL_OPS(double, double)
152 DEF_ARITH_NULLABLE(int8_t, int64_t, mod, %)
153 DEF_ARITH_NULLABLE(int16_t, int64_t, mod, %)
154 DEF_ARITH_NULLABLE(int32_t, int64_t, mod, %)
155 DEF_ARITH_NULLABLE(int64_t, int64_t, mod, %)
156 DEF_ARITH_NULLABLE_LHS(int8_t, int64_t, mod, %)
157 DEF_ARITH_NULLABLE_LHS(int16_t, int64_t, mod, %)
158 DEF_ARITH_NULLABLE_LHS(int32_t, int64_t, mod, %)
159 DEF_ARITH_NULLABLE_LHS(int64_t, int64_t, mod, %)
160 DEF_ARITH_NULLABLE_RHS(int8_t, int64_t, mod, %)
161 DEF_ARITH_NULLABLE_RHS(int16_t, int64_t, mod, %)
162 DEF_ARITH_NULLABLE_RHS(int32_t, int64_t, mod, %)
163 DEF_ARITH_NULLABLE_RHS(int64_t, int64_t, mod, %)
164 
165 #undef DEF_BINARY_NULLABLE_ALL_OPS
166 #undef DEF_SAFE_DIV_NULLABLE
167 #undef DEF_CMP_NULLABLE_RHS
168 #undef DEF_CMP_NULLABLE_LHS
169 #undef DEF_CMP_NULLABLE
170 #undef DEF_ARITH_NULLABLE_RHS
171 #undef DEF_ARITH_NULLABLE_LHS
172 #undef DEF_ARITH_NULLABLE
173 
174 extern "C" ALWAYS_INLINE int64_t scale_decimal_up(const int64_t operand,
175  const uint64_t scale,
176  const int64_t operand_null_val,
177  const int64_t result_null_val) {
178  return operand != operand_null_val ? operand * scale : result_null_val;
179 }
180 
181 extern "C" ALWAYS_INLINE int64_t scale_decimal_down_nullable(const int64_t operand,
182  const int64_t scale,
183  const int64_t null_val) {
184  // rounded scale down of a decimal
185  if (operand == null_val) {
186  return null_val;
187  }
188 
189  int64_t tmp = scale >> 1;
190  tmp = operand >= 0 ? operand + tmp : operand - tmp;
191  return tmp / scale;
192 }
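// Worked example: operand = 1234567 with scale = 1000 (i.e. 1234.567) gives
// tmp = 500 and (1234567 + 500) / 1000 = 1235, so the result is rounded to the
// nearest unit of scale, with halves rounded away from zero.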
193 
194 extern "C" ALWAYS_INLINE int64_t scale_decimal_down_not_nullable(const int64_t operand,
195  const int64_t scale,
196  const int64_t null_val) {
197  int64_t tmp = scale >> 1;
198  tmp = operand >= 0 ? operand + tmp : operand - tmp;
199  return tmp / scale;
200 }
201 
202 #define DEF_UMINUS_NULLABLE(type, null_type) \
203  extern "C" ALWAYS_INLINE type uminus_##type##_nullable(const type operand, \
204  const null_type null_val) { \
205  return operand == null_val ? null_val : -operand; \
206  }
207 
208 DEF_UMINUS_NULLABLE(int8_t, int8_t)
209 DEF_UMINUS_NULLABLE(int16_t, int16_t)
210 DEF_UMINUS_NULLABLE(int32_t, int32_t)
211 DEF_UMINUS_NULLABLE(int64_t, int64_t)
212 DEF_UMINUS_NULLABLE(float, float)
213 DEF_UMINUS_NULLABLE(double, double)
214 
215 #undef DEF_UMINUS_NULLABLE
216 
217 #define DEF_CAST_NULLABLE(from_type, to_type) \
218  extern "C" ALWAYS_INLINE to_type cast_##from_type##_to_##to_type##_nullable( \
219  const from_type operand, \
220  const from_type from_null_val, \
221  const to_type to_null_val) { \
222  return operand == from_null_val ? to_null_val : operand; \
223  }
224 
225 #define DEF_CAST_NULLABLE_BIDIR(type1, type2) \
226  DEF_CAST_NULLABLE(type1, type2) \
227  DEF_CAST_NULLABLE(type2, type1)
228 
229 DEF_CAST_NULLABLE_BIDIR(int8_t, int16_t)
230 DEF_CAST_NULLABLE_BIDIR(int8_t, int32_t)
231 DEF_CAST_NULLABLE_BIDIR(int8_t, int64_t)
232 DEF_CAST_NULLABLE_BIDIR(int16_t, int32_t)
233 DEF_CAST_NULLABLE_BIDIR(int16_t, int64_t)
234 DEF_CAST_NULLABLE_BIDIR(int32_t, int64_t)
235 DEF_CAST_NULLABLE_BIDIR(float, double)
236 DEF_CAST_NULLABLE_BIDIR(float, int8_t)
237 DEF_CAST_NULLABLE_BIDIR(float, int16_t)
238 DEF_CAST_NULLABLE_BIDIR(float, int32_t)
239 DEF_CAST_NULLABLE_BIDIR(float, int64_t)
240 DEF_CAST_NULLABLE_BIDIR(double, int8_t)
241 DEF_CAST_NULLABLE_BIDIR(double, int16_t)
242 DEF_CAST_NULLABLE_BIDIR(double, int32_t)
243 DEF_CAST_NULLABLE_BIDIR(double, int64_t)
244 DEF_CAST_NULLABLE(uint8_t, int32_t)
245 DEF_CAST_NULLABLE(uint16_t, int32_t)
246 
247 #undef DEF_CAST_NULLABLE_BIDIR
248 #undef DEF_CAST_NULLABLE
249 
250 extern "C" ALWAYS_INLINE int8_t logical_not(const int8_t operand, const int8_t null_val) {
251  return operand == null_val ? operand : (operand ? 0 : 1);
252 }
253 
254 extern "C" ALWAYS_INLINE int8_t logical_and(const int8_t lhs,
255  const int8_t rhs,
256  const int8_t null_val) {
257  if (lhs == null_val) {
258  return rhs == 0 ? rhs : null_val;
259  }
260  if (rhs == null_val) {
261  return lhs == 0 ? lhs : null_val;
262  }
263  return (lhs && rhs) ? 1 : 0;
264 }
265 
266 extern "C" ALWAYS_INLINE int8_t logical_or(const int8_t lhs,
267  const int8_t rhs,
268  const int8_t null_val) {
269  if (lhs == null_val) {
270  return rhs == 0 ? null_val : rhs;
271  }
272  if (rhs == null_val) {
273  return lhs == 0 ? null_val : lhs;
274  }
275  return (lhs || rhs) ? 1 : 0;
276 }
277 
278 // aggregator implementations
279 
280 extern "C" ALWAYS_INLINE uint64_t agg_count(uint64_t* agg, const int64_t) {
281  return (*agg)++;
282 }
283 
284 extern "C" ALWAYS_INLINE void agg_count_distinct_bitmap(int64_t* agg,
285  const int64_t val,
286  const int64_t min_val) {
287  const uint64_t bitmap_idx = val - min_val;
288  reinterpret_cast<int8_t*>(*agg)[bitmap_idx >> 3] |= (1 << (bitmap_idx & 7));
289 }
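// Here *agg holds the address of the distinct-count bitmap: the bit for
// (val - min_val) is set by indexing byte (bitmap_idx >> 3) and bit (bitmap_idx & 7).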
290 
291 #define GPU_RT_STUB NEVER_INLINE __attribute__((optnone))
292 
293 extern "C" GPU_RT_STUB void agg_count_distinct_bitmap_gpu(int64_t*,
294  const int64_t,
295  const int64_t,
296  const int64_t,
297  const int64_t,
298  const uint64_t,
299  const uint64_t) {}
300 
301 extern "C" NEVER_INLINE void agg_approximate_count_distinct(int64_t* agg,
302  const int64_t key,
303  const uint32_t b) {
304  const uint64_t hash = MurmurHash64A(&key, sizeof(key), 0);
305  const uint32_t index = hash >> (64 - b);
306  const uint8_t rank = get_rank(hash << b, 64 - b);
307  uint8_t* M = reinterpret_cast<uint8_t*>(*agg);
308  M[index] = std::max(M[index], rank);
309 }
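// HyperLogLog update: the top b bits of the Murmur hash pick a register in M,
// and that register keeps the maximum get_rank() value observed for the
// remaining 64 - b bits.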
310 
311 extern "C" GPU_RT_STUB void agg_approximate_count_distinct_gpu(int64_t*,
312  const int64_t,
313  const uint32_t,
314  const int64_t,
315  const int64_t) {}
316 
317 extern "C" ALWAYS_INLINE int8_t bit_is_set(const int64_t bitset,
318  const int64_t val,
319  const int64_t min_val,
320  const int64_t max_val,
321  const int64_t null_val,
322  const int8_t null_bool_val) {
323  if (val == null_val) {
324  return null_bool_val;
325  }
326  if (val < min_val || val > max_val) {
327  return 0;
328  }
329  if (!bitset) {
330  return 0;
331  }
332  const uint64_t bitmap_idx = val - min_val;
333  return (reinterpret_cast<const int8_t*>(bitset))[bitmap_idx >> 3] &
334  (1 << (bitmap_idx & 7))
335  ? 1
336  : 0;
337 }
338 
339 extern "C" ALWAYS_INLINE int64_t agg_sum(int64_t* agg, const int64_t val) {
340  const auto old = *agg;
341  *agg += val;
342  return old;
343 }
344 
345 extern "C" ALWAYS_INLINE void agg_max(int64_t* agg, const int64_t val) {
346  *agg = std::max(*agg, val);
347 }
348 
349 extern "C" ALWAYS_INLINE void agg_min(int64_t* agg, const int64_t val) {
350  *agg = std::min(*agg, val);
351 }
352 
353 extern "C" ALWAYS_INLINE void agg_id(int64_t* agg, const int64_t val) {
354  *agg = val;
355 }
356 
357 extern "C" ALWAYS_INLINE int32_t checked_single_agg_id(int64_t* agg,
358  const int64_t val,
359  const int64_t null_val) {
360  if (val == null_val) {
361  return 0;
362  }
363 
364  if (*agg == val) {
365  return 0;
366  } else if (*agg == null_val) {
367  *agg = val;
368  return 0;
369  } else {
370  // see Execute::ERR_SINGLE_VALUE_FOUND_MULTIPLE_VALUES
371  return 15;
372  }
373 }
374 
375 extern "C" ALWAYS_INLINE void agg_count_distinct_bitmap_skip_val(int64_t* agg,
376  const int64_t val,
377  const int64_t min_val,
378  const int64_t skip_val) {
379  if (val != skip_val) {
380  agg_count_distinct_bitmap(agg, val, min_val);
381  }
382 }
383 
384 extern "C" GPU_RT_STUB void agg_count_distinct_bitmap_skip_val_gpu(int64_t*,
385  const int64_t,
386  const int64_t,
387  const int64_t,
388  const int64_t,
389  const int64_t,
390  const uint64_t,
391  const uint64_t) {}
392 
393 extern "C" ALWAYS_INLINE uint32_t agg_count_int32(uint32_t* agg, const int32_t) {
394  return (*agg)++;
395 }
396 
397 extern "C" ALWAYS_INLINE int32_t agg_sum_int32(int32_t* agg, const int32_t val) {
398  const auto old = *agg;
399  *agg += val;
400  return old;
401 }
402 
403 #define DEF_AGG_MAX_INT(n) \
404  extern "C" ALWAYS_INLINE void agg_max_int##n(int##n##_t* agg, const int##n##_t val) { \
405  *agg = std::max(*agg, val); \
406  }
407 
408 DEF_AGG_MAX_INT(32)
409 DEF_AGG_MAX_INT(16)
410 DEF_AGG_MAX_INT(8)
411 #undef DEF_AGG_MAX_INT
412 
413 #define DEF_AGG_MIN_INT(n) \
414  extern "C" ALWAYS_INLINE void agg_min_int##n(int##n##_t* agg, const int##n##_t val) { \
415  *agg = std::min(*agg, val); \
416  }
417 
418 DEF_AGG_MIN_INT(32)
419 DEF_AGG_MIN_INT(16)
420 DEF_AGG_MIN_INT(8)
421 #undef DEF_AGG_MIN_INT
422 
423 #define DEF_AGG_ID_INT(n) \
424  extern "C" ALWAYS_INLINE void agg_id_int##n(int##n##_t* agg, const int##n##_t val) { \
425  *agg = val; \
426  }
427 
428 #define DEF_CHECKED_SINGLE_AGG_ID_INT(n) \
429  extern "C" ALWAYS_INLINE int32_t checked_single_agg_id_int##n( \
430  int##n##_t* agg, const int##n##_t val, const int##n##_t null_val) { \
431  if (val == null_val) { \
432  return 0; \
433  } \
434  if (*agg == val) { \
435  return 0; \
436  } else if (*agg == null_val) { \
437  *agg = val; \
438  return 0; \
439  } else { \
440  /* see Execute::ERR_SINGLE_VALUE_FOUND_MULTIPLE_VALUES*/ \
441  return 15; \
442  } \
443  }
444 
445 DEF_AGG_ID_INT(32)
446 DEF_AGG_ID_INT(16)
447 DEF_AGG_ID_INT(8)
448 
449 DEF_CHECKED_SINGLE_AGG_ID_INT(32)
450 DEF_CHECKED_SINGLE_AGG_ID_INT(16)
451 DEF_CHECKED_SINGLE_AGG_ID_INT(8)
452 
453 #undef DEF_AGG_ID_INT
454 #undef DEF_CHECKED_SINGLE_AGG_ID_INT
455 
456 #define DEF_WRITE_PROJECTION_INT(n) \
457  extern "C" ALWAYS_INLINE void write_projection_int##n( \
458  int8_t* slot_ptr, const int##n##_t val, const int64_t init_val) { \
459  if (val != init_val) { \
460  *reinterpret_cast<int##n##_t*>(slot_ptr) = val; \
461  } \
462  }
463 
464 DEF_WRITE_PROJECTION_INT(64)
465 DEF_WRITE_PROJECTION_INT(32)
466 #undef DEF_WRITE_PROJECTION_INT
467 
468 extern "C" ALWAYS_INLINE int64_t agg_sum_skip_val(int64_t* agg,
469  const int64_t val,
470  const int64_t skip_val) {
471  const auto old = *agg;
472  if (val != skip_val) {
473  if (old != skip_val) {
474  return agg_sum(agg, val);
475  } else {
476  *agg = val;
477  }
478  }
479  return old;
480 }
481 
482 extern "C" ALWAYS_INLINE int32_t agg_sum_int32_skip_val(int32_t* agg,
483  const int32_t val,
484  const int32_t skip_val) {
485  const auto old = *agg;
486  if (val != skip_val) {
487  if (old != skip_val) {
488  return agg_sum_int32(agg, val);
489  } else {
490  *agg = val;
491  }
492  }
493  return old;
494 }
495 
496 extern "C" ALWAYS_INLINE uint64_t agg_count_skip_val(uint64_t* agg,
497  const int64_t val,
498  const int64_t skip_val) {
499  if (val != skip_val) {
500  return agg_count(agg, val);
501  }
502  return *agg;
503 }
504 
505 extern "C" ALWAYS_INLINE uint32_t agg_count_int32_skip_val(uint32_t* agg,
506  const int32_t val,
507  const int32_t skip_val) {
508  if (val != skip_val) {
509  return agg_count_int32(agg, val);
510  }
511  return *agg;
512 }
513 
514 #define DEF_SKIP_AGG_ADD(base_agg_func) \
515  extern "C" ALWAYS_INLINE void base_agg_func##_skip_val( \
516  DATA_T* agg, const DATA_T val, const DATA_T skip_val) { \
517  if (val != skip_val) { \
518  base_agg_func(agg, val); \
519  } \
520  }
521 
522 #define DEF_SKIP_AGG(base_agg_func) \
523  extern "C" ALWAYS_INLINE void base_agg_func##_skip_val( \
524  DATA_T* agg, const DATA_T val, const DATA_T skip_val) { \
525  if (val != skip_val) { \
526  const DATA_T old_agg = *agg; \
527  if (old_agg != skip_val) { \
528  base_agg_func(agg, val); \
529  } else { \
530  *agg = val; \
531  } \
532  } \
533  }
534 
535 #define DATA_T int64_t
536 DEF_SKIP_AGG(agg_max)
537 DEF_SKIP_AGG(agg_min)
538 #undef DATA_T
539 
540 #define DATA_T int32_t
541 DEF_SKIP_AGG(agg_max_int32)
542 DEF_SKIP_AGG(agg_min_int32)
543 #undef DATA_T
544 
545 #define DATA_T int16_t
546 DEF_SKIP_AGG(agg_max_int16)
547 DEF_SKIP_AGG(agg_min_int16)
548 #undef DATA_T
549 
550 #define DATA_T int8_t
551 DEF_SKIP_AGG(agg_max_int8)
552 DEF_SKIP_AGG(agg_min_int8)
553 #undef DATA_T
554 
555 #undef DEF_SKIP_AGG_ADD
556 #undef DEF_SKIP_AGG
557 
558 // TODO(alex): fix signature
559 
560 extern "C" ALWAYS_INLINE uint64_t agg_count_double(uint64_t* agg, const double val) {
561  return (*agg)++;
562 }
563 
564 extern "C" ALWAYS_INLINE void agg_sum_double(int64_t* agg, const double val) {
565  const auto r = *reinterpret_cast<const double*>(agg) + val;
566  *agg = *reinterpret_cast<const int64_t*>(may_alias_ptr(&r));
567 }
568 
569 extern "C" ALWAYS_INLINE void agg_max_double(int64_t* agg, const double val) {
570  const auto r = std::max(*reinterpret_cast<const double*>(agg), val);
571  *agg = *(reinterpret_cast<const int64_t*>(may_alias_ptr(&r)));
572 }
573 
574 extern "C" ALWAYS_INLINE void agg_min_double(int64_t* agg, const double val) {
575  const auto r = std::min(*reinterpret_cast<const double*>(agg), val);
576  *agg = *(reinterpret_cast<const int64_t*>(may_alias_ptr(&r)));
577 }
578 
579 extern "C" ALWAYS_INLINE void agg_id_double(int64_t* agg, const double val) {
580  *agg = *(reinterpret_cast<const int64_t*>(may_alias_ptr(&val)));
581 }
582 
583 extern "C" ALWAYS_INLINE int32_t checked_single_agg_id_double(int64_t* agg,
584  const double val,
585  const double null_val) {
586  if (val == null_val) {
587  return 0;
588  }
589 
590  if (*agg == *(reinterpret_cast<const int64_t*>(may_alias_ptr(&val)))) {
591  return 0;
592  } else if (*agg == *(reinterpret_cast<const int64_t*>(may_alias_ptr(&null_val)))) {
593  *agg = *(reinterpret_cast<const int64_t*>(may_alias_ptr(&val)));
594  return 0;
595  } else {
596  // see Execute::ERR_SINGLE_VALUE_FOUND_MULTIPLE_VALUES
597  return 15;
598  }
599 }
600 
601 extern "C" ALWAYS_INLINE uint32_t agg_count_float(uint32_t* agg, const float val) {
602  return (*agg)++;
603 }
604 
605 extern "C" ALWAYS_INLINE void agg_sum_float(int32_t* agg, const float val) {
606  const auto r = *reinterpret_cast<const float*>(agg) + val;
607  *agg = *reinterpret_cast<const int32_t*>(may_alias_ptr(&r));
608 }
609 
610 extern "C" ALWAYS_INLINE void agg_max_float(int32_t* agg, const float val) {
611  const auto r = std::max(*reinterpret_cast<const float*>(agg), val);
612  *agg = *(reinterpret_cast<const int32_t*>(may_alias_ptr(&r)));
613 }
614 
615 extern "C" ALWAYS_INLINE void agg_min_float(int32_t* agg, const float val) {
616  const auto r = std::min(*reinterpret_cast<const float*>(agg), val);
617  *agg = *(reinterpret_cast<const int32_t*>(may_alias_ptr(&r)));
618 }
619 
620 extern "C" ALWAYS_INLINE void agg_id_float(int32_t* agg, const float val) {
621  *agg = *(reinterpret_cast<const int32_t*>(may_alias_ptr(&val)));
622 }
623 
624 extern "C" ALWAYS_INLINE int32_t checked_single_agg_id_float(int32_t* agg,
625  const float val,
626  const float null_val) {
627  if (val == null_val) {
628  return 0;
629  }
630 
631  if (*agg == *(reinterpret_cast<const int32_t*>(may_alias_ptr(&val)))) {
632  return 0;
633  } else if (*agg == *(reinterpret_cast<const int32_t*>(may_alias_ptr(&null_val)))) {
634  *agg = *(reinterpret_cast<const int32_t*>(may_alias_ptr(&val)));
635  return 0;
636  } else {
637  // see Execute::ERR_SINGLE_VALUE_FOUND_MULTIPLE_VALUES
638  return 15;
639  }
640 }
641 
642 extern "C" ALWAYS_INLINE uint64_t agg_count_double_skip_val(uint64_t* agg,
643  const double val,
644  const double skip_val) {
645  if (val != skip_val) {
646  return agg_count_double(agg, val);
647  }
648  return *agg;
649 }
650 
651 extern "C" ALWAYS_INLINE uint32_t agg_count_float_skip_val(uint32_t* agg,
652  const float val,
653  const float skip_val) {
654  if (val != skip_val) {
655  return agg_count_float(agg, val);
656  }
657  return *agg;
658 }
659 
660 #define DEF_SKIP_AGG_ADD(base_agg_func) \
661  extern "C" ALWAYS_INLINE void base_agg_func##_skip_val( \
662  ADDR_T* agg, const DATA_T val, const DATA_T skip_val) { \
663  if (val != skip_val) { \
664  base_agg_func(agg, val); \
665  } \
666  }
667 
668 #define DEF_SKIP_AGG(base_agg_func) \
669  extern "C" ALWAYS_INLINE void base_agg_func##_skip_val( \
670  ADDR_T* agg, const DATA_T val, const DATA_T skip_val) { \
671  if (val != skip_val) { \
672  const ADDR_T old_agg = *agg; \
673  if (old_agg != *reinterpret_cast<const ADDR_T*>(may_alias_ptr(&skip_val))) { \
674  base_agg_func(agg, val); \
675  } else { \
676  *agg = *reinterpret_cast<const ADDR_T*>(may_alias_ptr(&val)); \
677  } \
678  } \
679  }
680 
681 #define DATA_T double
682 #define ADDR_T int64_t
683 DEF_SKIP_AGG(agg_sum_double)
684 DEF_SKIP_AGG(agg_max_double)
685 DEF_SKIP_AGG(agg_min_double)
686 #undef ADDR_T
687 #undef DATA_T
688 
689 #define DATA_T float
690 #define ADDR_T int32_t
691 DEF_SKIP_AGG(agg_sum_float)
692 DEF_SKIP_AGG(agg_max_float)
693 DEF_SKIP_AGG(agg_min_float)
694 #undef ADDR_T
695 #undef DATA_T
696 
697 #undef DEF_SKIP_AGG_ADD
698 #undef DEF_SKIP_AGG
699 
700 extern "C" ALWAYS_INLINE int64_t decimal_floor(const int64_t x, const int64_t scale) {
701  if (x >= 0) {
702  return x / scale * scale;
703  }
704  if (!(x % scale)) {
705  return x;
706  }
707  return x / scale * scale - scale;
708 }
709 
710 extern "C" ALWAYS_INLINE int64_t decimal_ceil(const int64_t x, const int64_t scale) {
711  return decimal_floor(x, scale) + (x % scale ? scale : 0);
712 }
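// Worked example with scale = 1000: decimal_floor(-1234567, 1000) == -1235000 and
// decimal_ceil(-1234567, 1000) == -1234000, i.e. floor and ceiling in units of scale.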
713 
714 // Shared memory aggregators. These should never be called;
715 // the real implementations are in cuda_mapd_rt.cu.
716 #define DEF_SHARED_AGG_RET_STUBS(base_agg_func) \
717  extern "C" GPU_RT_STUB uint64_t base_agg_func##_shared(uint64_t* agg, \
718  const int64_t val) { \
719  return 0; \
720  } \
721  \
722  extern "C" GPU_RT_STUB uint64_t base_agg_func##_skip_val_shared( \
723  uint64_t* agg, const int64_t val, const int64_t skip_val) { \
724  return 0; \
725  } \
726  extern "C" GPU_RT_STUB uint32_t base_agg_func##_int32_shared(uint32_t* agg, \
727  const int32_t val) { \
728  return 0; \
729  } \
730  \
731  extern "C" GPU_RT_STUB uint32_t base_agg_func##_int32_skip_val_shared( \
732  uint32_t* agg, const int32_t val, const int32_t skip_val) { \
733  return 0; \
734  } \
735  \
736  extern "C" GPU_RT_STUB uint64_t base_agg_func##_double_shared(uint64_t* agg, \
737  const double val) { \
738  return 0; \
739  } \
740  \
741  extern "C" GPU_RT_STUB uint64_t base_agg_func##_double_skip_val_shared( \
742  uint64_t* agg, const double val, const double skip_val) { \
743  return 0; \
744  } \
745  extern "C" GPU_RT_STUB uint32_t base_agg_func##_float_shared(uint32_t* agg, \
746  const float val) { \
747  return 0; \
748  } \
749  \
750  extern "C" GPU_RT_STUB uint32_t base_agg_func##_float_skip_val_shared( \
751  uint32_t* agg, const float val, const float skip_val) { \
752  return 0; \
753  }
754 
755 #define DEF_SHARED_AGG_STUBS(base_agg_func) \
756  extern "C" GPU_RT_STUB void base_agg_func##_shared(int64_t* agg, const int64_t val) {} \
757  \
758  extern "C" GPU_RT_STUB void base_agg_func##_skip_val_shared( \
759  int64_t* agg, const int64_t val, const int64_t skip_val) {} \
760  extern "C" GPU_RT_STUB void base_agg_func##_int32_shared(int32_t* agg, \
761  const int32_t val) {} \
762  extern "C" GPU_RT_STUB void base_agg_func##_int16_shared(int16_t* agg, \
763  const int16_t val) {} \
764  extern "C" GPU_RT_STUB void base_agg_func##_int8_shared(int8_t* agg, \
765  const int8_t val) {} \
766  \
767  extern "C" GPU_RT_STUB void base_agg_func##_int32_skip_val_shared( \
768  int32_t* agg, const int32_t val, const int32_t skip_val) {} \
769  \
770  extern "C" GPU_RT_STUB void base_agg_func##_double_shared(int64_t* agg, \
771  const double val) {} \
772  \
773  extern "C" GPU_RT_STUB void base_agg_func##_double_skip_val_shared( \
774  int64_t* agg, const double val, const double skip_val) {} \
775  extern "C" GPU_RT_STUB void base_agg_func##_float_shared(int32_t* agg, \
776  const float val) {} \
777  \
778  extern "C" GPU_RT_STUB void base_agg_func##_float_skip_val_shared( \
779  int32_t* agg, const float val, const float skip_val) {}
780 
781 DEF_SHARED_AGG_RET_STUBS(agg_count)
782 DEF_SHARED_AGG_STUBS(agg_max)
783 DEF_SHARED_AGG_STUBS(agg_min)
784 DEF_SHARED_AGG_STUBS(agg_id)
785 
786 extern "C" GPU_RT_STUB int32_t checked_single_agg_id_shared(int64_t* agg,
787  const int64_t val,
788  const int64_t null_val) {
789  return 0;
790 }
791 
792 extern "C" GPU_RT_STUB int32_t
793 checked_single_agg_id_int32_shared(int32_t* agg,
794  const int32_t val,
795  const int32_t null_val) {
796  return 0;
797 }
798 extern "C" GPU_RT_STUB int32_t
799 checked_single_agg_id_int16_shared(int16_t* agg,
800  const int16_t val,
801  const int16_t null_val) {
802  return 0;
803 }
804 extern "C" GPU_RT_STUB int32_t checked_single_agg_id_int8_shared(int8_t* agg,
805  const int8_t val,
806  const int8_t null_val) {
807  return 0;
808 }
809 
810 extern "C" GPU_RT_STUB int32_t
811 checked_single_agg_id_double_shared(int64_t* agg,
812  const double val,
813  const double null_val) {
814  return 0;
815 }
816 
817 extern "C" GPU_RT_STUB int32_t checked_single_agg_id_float_shared(int32_t* agg,
818  const float val,
819  const float null_val) {
820  return 0;
821 }
822 
823 extern "C" GPU_RT_STUB void agg_max_int16_skip_val_shared(int16_t* agg,
824  const int16_t val,
825  const int16_t skip_val) {}
826 
827 extern "C" GPU_RT_STUB void agg_max_int8_skip_val_shared(int8_t* agg,
828  const int8_t val,
829  const int8_t skip_val) {}
830 
831 extern "C" GPU_RT_STUB void agg_min_int16_skip_val_shared(int16_t* agg,
832  const int16_t val,
833  const int16_t skip_val) {}
834 
835 extern "C" GPU_RT_STUB void agg_min_int8_skip_val_shared(int8_t* agg,
836  const int8_t val,
837  const int8_t skip_val) {}
838 
839 extern "C" GPU_RT_STUB void agg_id_double_shared_slow(int64_t* agg, const double* val) {}
840 
841 extern "C" GPU_RT_STUB int64_t agg_sum_shared(int64_t* agg, const int64_t val) {
842  return 0;
843 }
844 
845 extern "C" GPU_RT_STUB int64_t agg_sum_skip_val_shared(int64_t* agg,
846  const int64_t val,
847  const int64_t skip_val) {
848  return 0;
849 }
850 extern "C" GPU_RT_STUB int32_t agg_sum_int32_shared(int32_t* agg, const int32_t val) {
851  return 0;
852 }
853 
854 extern "C" GPU_RT_STUB int32_t agg_sum_int32_skip_val_shared(int32_t* agg,
855  const int32_t val,
856  const int32_t skip_val) {
857  return 0;
858 }
859 
860 extern "C" GPU_RT_STUB void agg_sum_double_shared(int64_t* agg, const double val) {}
861 
862 extern "C" GPU_RT_STUB void agg_sum_double_skip_val_shared(int64_t* agg,
863  const double val,
864  const double skip_val) {}
865 extern "C" GPU_RT_STUB void agg_sum_float_shared(int32_t* agg, const float val) {}
866 
867 extern "C" GPU_RT_STUB void agg_sum_float_skip_val_shared(int32_t* agg,
868  const float val,
869  const float skip_val) {}
870 
871 extern "C" GPU_RT_STUB void force_sync() {}
872 
873 extern "C" GPU_RT_STUB void sync_warp() {}
874 extern "C" GPU_RT_STUB void sync_warp_protected(int64_t thread_pos, int64_t row_count) {}
875 
876 // x64 stride functions
877 
878 extern "C" __attribute__((noinline)) int32_t pos_start_impl(int32_t* error_code) {
879  int32_t row_index_resume{0};
880  if (error_code) {
881  row_index_resume = error_code[0];
882  error_code[0] = 0;
883  }
884  return row_index_resume;
885 }
886 
887 extern "C" __attribute__((noinline)) int32_t group_buff_idx_impl() {
888  return pos_start_impl(nullptr);
889 }
890 
891 extern "C" __attribute__((noinline)) int32_t pos_step_impl() {
892  return 1;
893 }
894 
895 extern "C" GPU_RT_STUB int8_t thread_warp_idx(const int8_t warp_sz) {
896  return 0;
897 }
898 
899 #undef GPU_RT_STUB
900 
901 extern "C" ALWAYS_INLINE int32_t record_error_code(const int32_t err_code,
902  int32_t* error_codes) {
903  // NB: never override persistent error codes (with code greater than zero).
904  // On GPU, a projection query with a limit can run out of slots without it
905  // being an actual error if the limit has been hit. If a persistent error
906  // (division by zero, for example) occurs before running out of slots, we
907  // have to avoid overriding it, because there's a risk that the query would
908  // go through if we override with a potentially benign out-of-slots code.
909  if (err_code && error_codes[pos_start_impl(nullptr)] <= 0) {
910  error_codes[pos_start_impl(nullptr)] = err_code;
911  }
912  return err_code;
913 }
914 
915 // group by helpers
916 
917 extern "C" __attribute__((noinline)) const int64_t* init_shared_mem_nop(
918  const int64_t* groups_buffer,
919  const int32_t groups_buffer_size) {
920  return groups_buffer;
921 }
922 
923 extern "C" __attribute__((noinline)) void write_back_nop(int64_t* dest,
924  int64_t* src,
925  const int32_t sz) {
926  // the body is not really needed, just make sure the call is not optimized away
927  assert(dest);
928 }
929 
930 extern "C" __attribute__((noinline)) const int64_t* init_shared_mem(
931  const int64_t* groups_buffer,
932  const int32_t groups_buffer_size) {
933  return init_shared_mem_nop(groups_buffer, groups_buffer_size);
934 }
935 
936 extern "C" __attribute__((noinline)) const int64_t* init_shared_mem_dynamic(
937  const int64_t* groups_buffer,
938  const int32_t groups_buffer_size) {
939  return nullptr;
940 }
941 
942 extern "C" __attribute__((noinline)) void write_back(int64_t* dest,
943  int64_t* src,
944  const int32_t sz) {
945  write_back_nop(dest, src, sz);
946 }
947 
948 extern "C" __attribute__((noinline)) void write_back_smem_nop(int64_t* dest,
949  int64_t* src,
950  const int32_t sz) {
951  assert(dest);
952 }
953 
954 extern "C" __attribute__((noinline)) void agg_from_smem_to_gmem_nop(int64_t* dest,
955  int64_t* src,
956  const int32_t sz) {
957  assert(dest);
958 }
959 
960 extern "C" __attribute__((noinline)) void
961 agg_from_smem_to_gmem_count_binId(int64_t* dest, int64_t* src, const int32_t sz) {
962  return agg_from_smem_to_gmem_nop(dest, src, sz);
963 }
964 
965 extern "C" __attribute__((noinline)) void
966 agg_from_smem_to_gmem_binId_count(int64_t* dest, int64_t* src, const int32_t sz) {
967  return agg_from_smem_to_gmem_nop(dest, src, sz);
968 }
969 
970 extern "C" __attribute__((noinline)) void init_group_by_buffer_gpu(
971  int64_t* groups_buffer,
972  const int64_t* init_vals,
973  const uint32_t groups_buffer_entry_count,
974  const uint32_t key_qw_count,
975  const uint32_t agg_col_count,
976  const bool keyless,
977  const int8_t warp_size) {
978  // the body is not really needed, just make sure the call is not optimized away
979  assert(groups_buffer);
980 }
981 
982 extern "C" __attribute__((noinline)) void init_columnar_group_by_buffer_gpu(
983  int64_t* groups_buffer,
984  const int64_t* init_vals,
985  const uint32_t groups_buffer_entry_count,
986  const uint32_t key_qw_count,
987  const uint32_t agg_col_count,
988  const bool keyless,
989  const bool blocks_share_memory,
990  const int32_t frag_idx) {
991  // the body is not really needed, just make sure the call is not optimized away
992  assert(groups_buffer);
993 }
994 
995 extern "C" __attribute__((noinline)) void init_group_by_buffer_impl(
996  int64_t* groups_buffer,
997  const int64_t* init_vals,
998  const uint32_t groups_buffer_entry_count,
999  const uint32_t key_qw_count,
1000  const uint32_t agg_col_count,
1001  const bool keyless,
1002  const int8_t warp_size) {
1003  // the body is not really needed, just make sure the call is not optimized away
1004  assert(groups_buffer);
1005 }
1006 
1007 template <typename T>
1008 ALWAYS_INLINE int64_t* get_matching_group_value(int64_t* groups_buffer,
1009  const uint32_t h,
1010  const T* key,
1011  const uint32_t key_count,
1012  const uint32_t row_size_quad) {
1013  auto off = h * row_size_quad;
1014  auto row_ptr = reinterpret_cast<T*>(groups_buffer + off);
1015  if (*row_ptr == get_empty_key<T>()) {
1016  memcpy(row_ptr, key, key_count * sizeof(T));
1017  auto row_ptr_i8 = reinterpret_cast<int8_t*>(row_ptr + key_count);
1018  return reinterpret_cast<int64_t*>(align_to_int64(row_ptr_i8));
1019  }
1020  if (memcmp(row_ptr, key, key_count * sizeof(T)) == 0) {
1021  auto row_ptr_i8 = reinterpret_cast<int8_t*>(row_ptr + key_count);
1022  return reinterpret_cast<int64_t*>(align_to_int64(row_ptr_i8));
1023  }
1024  return nullptr;
1025 }
1026 
1027 extern "C" ALWAYS_INLINE int64_t* get_matching_group_value(int64_t* groups_buffer,
1028  const uint32_t h,
1029  const int64_t* key,
1030  const uint32_t key_count,
1031  const uint32_t key_width,
1032  const uint32_t row_size_quad,
1033  const int64_t* init_vals) {
1034  switch (key_width) {
1035  case 4:
1036  return get_matching_group_value(groups_buffer,
1037  h,
1038  reinterpret_cast<const int32_t*>(key),
1039  key_count,
1040  row_size_quad);
1041  case 8:
1042  return get_matching_group_value(groups_buffer, h, key, key_count, row_size_quad);
1043  default:;
1044  }
1045  return nullptr;
1046 }
1047 
1048 template <typename T>
1049 ALWAYS_INLINE int32_t get_matching_group_value_columnar_slot(int64_t* groups_buffer,
1050  const uint32_t entry_count,
1051  const uint32_t h,
1052  const T* key,
1053  const uint32_t key_count) {
1054  auto off = h;
1055  auto key_buffer = reinterpret_cast<T*>(groups_buffer);
1056  if (key_buffer[off] == get_empty_key<T>()) {
1057  for (size_t i = 0; i < key_count; ++i) {
1058  key_buffer[off] = key[i];
1059  off += entry_count;
1060  }
1061  return h;
1062  }
1063  off = h;
1064  for (size_t i = 0; i < key_count; ++i) {
1065  if (key_buffer[off] != key[i]) {
1066  return -1;
1067  }
1068  off += entry_count;
1069  }
1070  return h;
1071 }
1072 
1073 extern "C" ALWAYS_INLINE int32_t
1074 get_matching_group_value_columnar_slot(int64_t* groups_buffer,
1075  const uint32_t entry_count,
1076  const uint32_t h,
1077  const int64_t* key,
1078  const uint32_t key_count,
1079  const uint32_t key_width) {
1080  switch (key_width) {
1081  case 4:
1082  return get_matching_group_value_columnar_slot(groups_buffer,
1083  entry_count,
1084  h,
1085  reinterpret_cast<const int32_t*>(key),
1086  key_count);
1087  case 8:
1088  return get_matching_group_value_columnar_slot(
1089  groups_buffer, entry_count, h, key, key_count);
1090  default:
1091  return -1;
1092  }
1093  return -1;
1094 }
1095 
1096 extern "C" ALWAYS_INLINE int64_t* get_matching_group_value_columnar(
1097  int64_t* groups_buffer,
1098  const uint32_t h,
1099  const int64_t* key,
1100  const uint32_t key_qw_count,
1101  const size_t entry_count) {
1102  auto off = h;
1103  if (groups_buffer[off] == EMPTY_KEY_64) {
1104  for (size_t i = 0; i < key_qw_count; ++i) {
1105  groups_buffer[off] = key[i];
1106  off += entry_count;
1107  }
1108  return &groups_buffer[off];
1109  }
1110  off = h;
1111  for (size_t i = 0; i < key_qw_count; ++i) {
1112  if (groups_buffer[off] != key[i]) {
1113  return nullptr;
1114  }
1115  off += entry_count;
1116  }
1117  return &groups_buffer[off];
1118 }
1119 
1120 /*
1121  * For a particular hashed_index, returns the row-wise offset
1122  * to the first matching agg column in memory.
1123  * It also checks the first group column and, if the entry is empty,
1124  * initializes all the key columns for that entry (it is assumed all group
1125  * columns are 64-bit wide).
1126  *
1127  * Memory layout:
1128  *
1129  * | prepended group columns (64-bit each) | agg columns |
1130  */
1131 extern "C" ALWAYS_INLINE int64_t* get_matching_group_value_perfect_hash(
1132  int64_t* groups_buffer,
1133  const uint32_t hashed_index,
1134  const int64_t* key,
1135  const uint32_t key_count,
1136  const uint32_t row_size_quad) {
1137  uint32_t off = hashed_index * row_size_quad;
1138  if (groups_buffer[off] == EMPTY_KEY_64) {
1139  for (uint32_t i = 0; i < key_count; ++i) {
1140  groups_buffer[off + i] = key[i];
1141  }
1142  }
1143  return groups_buffer + off + key_count;
1144 }
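// For instance, with key_count = 2 and row_size_quad = 4 the row for hashed_index
// starts at groups_buffer + 4 * hashed_index; slots 0-1 hold the (possibly
// just-initialized) key and the returned pointer addresses slot 2, the first agg column.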
1145 
1146 /*
1147  * For a particular hashed_index, find and initialize (if necessary) all the group
1148  * columns corresponding to a key. It is assumed that all group columns are 64-bit wide.
1149  */
1150 extern "C" ALWAYS_INLINE void set_matching_group_value_perfect_hash_columnar(
1151  int64_t* groups_buffer,
1152  const uint32_t hashed_index,
1153  const int64_t* key,
1154  const uint32_t key_count,
1155  const uint32_t entry_count) {
1156  if (groups_buffer[hashed_index] == EMPTY_KEY_64) {
1157  for (uint32_t i = 0; i < key_count; i++) {
1158  groups_buffer[i * entry_count + hashed_index] = key[i];
1159  }
1160  }
1161 }
1162 
1163 #include "GroupByRuntime.cpp"
1164 #include "JoinHashTableQueryRuntime.cpp"
1165 
1166 extern "C" ALWAYS_INLINE int64_t* get_group_value_fast_keyless(
1167  int64_t* groups_buffer,
1168  const int64_t key,
1169  const int64_t min_key,
1170  const int64_t /* bucket */,
1171  const uint32_t row_size_quad) {
1172  return groups_buffer + row_size_quad * (key - min_key);
1173 }
1174 
1175 extern "C" ALWAYS_INLINE int64_t* get_group_value_fast_keyless_semiprivate(
1176  int64_t* groups_buffer,
1177  const int64_t key,
1178  const int64_t min_key,
1179  const int64_t /* bucket */,
1180  const uint32_t row_size_quad,
1181  const uint8_t thread_warp_idx,
1182  const uint8_t warp_size) {
1183  return groups_buffer + row_size_quad * (warp_size * (key - min_key) + thread_warp_idx);
1184 }
1185 
1186 extern "C" ALWAYS_INLINE int8_t* extract_str_ptr(const uint64_t str_and_len) {
1187  return reinterpret_cast<int8_t*>(str_and_len & 0xffffffffffff);
1188 }
1189 
1190 extern "C" ALWAYS_INLINE int32_t extract_str_len(const uint64_t str_and_len) {
1191  return static_cast<int64_t>(str_and_len) >> 48;
1192 }
1193 
1194 extern "C" __attribute__((noinline)) int8_t* extract_str_ptr_noinline(
1195  const uint64_t str_and_len) {
1196  return extract_str_ptr(str_and_len);
1197 }
1198 
1199 extern "C" __attribute__((noinline)) int32_t extract_str_len_noinline(
1200  const uint64_t str_and_len) {
1201  return extract_str_len(str_and_len);
1202 }
1203 
1204 extern "C" ALWAYS_INLINE uint64_t string_pack(const int8_t* ptr, const int32_t len) {
1205  return (reinterpret_cast<const uint64_t>(ptr) & 0xffffffffffff) |
1206  (static_cast<const uint64_t>(len) << 48);
1207 }
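// string_pack stores the pointer in the low 48 bits and the length in the high
// 16 bits of a single uint64_t; extract_str_ptr / extract_str_len above undo it.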
1208 
1209 #ifdef __clang__
1210 #include "../Utils/StringLike.cpp"
1211 #endif
1212 
1213 #ifndef __CUDACC__
1214 #include "TopKRuntime.cpp"
1215 #endif
1216 
1217 extern "C" ALWAYS_INLINE DEVICE int32_t char_length(const char* str,
1218  const int32_t str_len) {
1219  return str_len;
1220 }
1221 
1222 extern "C" ALWAYS_INLINE DEVICE int32_t char_length_nullable(const char* str,
1223  const int32_t str_len,
1224  const int32_t int_null) {
1225  if (!str) {
1226  return int_null;
1227  }
1228  return str_len;
1229 }
1230 
1231 extern "C" ALWAYS_INLINE DEVICE int32_t key_for_string_encoded(const int32_t str_id) {
1232  return str_id;
1233 }
1234 
1235 extern "C" ALWAYS_INLINE int64_t row_number_window_func(const int64_t output_buff,
1236  const int64_t pos) {
1237  return reinterpret_cast<const int64_t*>(output_buff)[pos];
1238 }
1239 
1240 extern "C" ALWAYS_INLINE double percent_window_func(const int64_t output_buff,
1241  const int64_t pos) {
1242  return reinterpret_cast<const double*>(output_buff)[pos];
1243 }
1244 
1245 extern "C" ALWAYS_INLINE double load_double(const int64_t* agg) {
1246  return *reinterpret_cast<const double*>(may_alias_ptr(agg));
1247 }
1248 
1249 extern "C" ALWAYS_INLINE float load_float(const int32_t* agg) {
1250  return *reinterpret_cast<const float*>(may_alias_ptr(agg));
1251 }
1252 
1253 extern "C" ALWAYS_INLINE double load_avg_int(const int64_t* sum,
1254  const int64_t* count,
1255  const double null_val) {
1256  return *count != 0 ? static_cast<double>(*sum) / *count : null_val;
1257 }
1258 
1259 extern "C" ALWAYS_INLINE double load_avg_decimal(const int64_t* sum,
1260  const int64_t* count,
1261  const double null_val,
1262  const uint32_t scale) {
1263  return *count != 0 ? (static_cast<double>(*sum) / pow(10, scale)) / *count : null_val;
1264 }
1265 
1266 extern "C" ALWAYS_INLINE double load_avg_double(const int64_t* agg,
1267  const int64_t* count,
1268  const double null_val) {
1269  return *count != 0 ? *reinterpret_cast<const double*>(may_alias_ptr(agg)) / *count
1270  : null_val;
1271 }
1272 
1273 extern "C" ALWAYS_INLINE double load_avg_float(const int32_t* agg,
1274  const int32_t* count,
1275  const double null_val) {
1276  return *count != 0 ? *reinterpret_cast<const float*>(may_alias_ptr(agg)) / *count
1277  : null_val;
1278 }
1279 
1280 extern "C" NEVER_INLINE void linear_probabilistic_count(uint8_t* bitmap,
1281  const uint32_t bitmap_bytes,
1282  const uint8_t* key_bytes,
1283  const uint32_t key_len) {
1284  const uint32_t bit_pos = MurmurHash1(key_bytes, key_len, 0) % (bitmap_bytes * 8);
1285  const uint32_t word_idx = bit_pos / 32;
1286  const uint32_t bit_idx = bit_pos % 32;
1287  reinterpret_cast<uint32_t*>(bitmap)[word_idx] |= 1 << bit_idx;
1288 }
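// Linear-counting update: MurmurHash1 of the key, reduced modulo the number of
// bits in the bitmap (bitmap_bytes * 8), selects a single bit which is then set
// in the containing 32-bit word.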
1289 
1290 extern "C" __attribute__((noinline)) void query_stub_hoisted_literals(
1291  const int8_t** col_buffers,
1292  const int8_t* literals,
1293  const int64_t* num_rows,
1294  const uint64_t* frag_row_offsets,
1295  const int32_t* max_matched,
1296  const int64_t* init_agg_value,
1297  int64_t** out,
1298  uint32_t frag_idx,
1299  const int64_t* join_hash_tables,
1300  int32_t* error_code,
1301  int32_t* total_matched) {
1302  assert(col_buffers || literals || num_rows || frag_row_offsets || max_matched ||
1303  init_agg_value || out || frag_idx || error_code || join_hash_tables ||
1304  total_matched);
1305 }
1306 
1307 extern "C" void multifrag_query_hoisted_literals(const int8_t*** col_buffers,
1308  const uint64_t* num_fragments,
1309  const int8_t* literals,
1310  const int64_t* num_rows,
1311  const uint64_t* frag_row_offsets,
1312  const int32_t* max_matched,
1313  int32_t* total_matched,
1314  const int64_t* init_agg_value,
1315  int64_t** out,
1316  int32_t* error_code,
1317  const uint32_t* num_tables_ptr,
1318  const int64_t* join_hash_tables) {
1319  for (uint32_t i = 0; i < *num_fragments; ++i) {
1320  query_stub_hoisted_literals(col_buffers ? col_buffers[i] : nullptr,
1321  literals,
1322  &num_rows[i * (*num_tables_ptr)],
1323  &frag_row_offsets[i * (*num_tables_ptr)],
1324  max_matched,
1325  init_agg_value,
1326  out,
1327  i,
1328  join_hash_tables,
1329  total_matched,
1330  error_code);
1331  }
1332 }
1333 
1334 extern "C" __attribute__((noinline)) void query_stub(const int8_t** col_buffers,
1335  const int64_t* num_rows,
1336  const uint64_t* frag_row_offsets,
1337  const int32_t* max_matched,
1338  const int64_t* init_agg_value,
1339  int64_t** out,
1340  uint32_t frag_idx,
1341  const int64_t* join_hash_tables,
1342  int32_t* error_code,
1343  int32_t* total_matched) {
1344  assert(col_buffers || num_rows || frag_row_offsets || max_matched || init_agg_value ||
1345  out || frag_idx || error_code || join_hash_tables || total_matched);
1346 }
1347 
1348 extern "C" void multifrag_query(const int8_t*** col_buffers,
1349  const uint64_t* num_fragments,
1350  const int64_t* num_rows,
1351  const uint64_t* frag_row_offsets,
1352  const int32_t* max_matched,
1353  int32_t* total_matched,
1354  const int64_t* init_agg_value,
1355  int64_t** out,
1356  int32_t* error_code,
1357  const uint32_t* num_tables_ptr,
1358  const int64_t* join_hash_tables) {
1359  for (uint32_t i = 0; i < *num_fragments; ++i) {
1360  query_stub(col_buffers ? col_buffers[i] : nullptr,
1361  &num_rows[i * (*num_tables_ptr)],
1362  &frag_row_offsets[i * (*num_tables_ptr)],
1363  max_matched,
1364  init_agg_value,
1365  out,
1366  i,
1367  join_hash_tables,
1368  total_matched,
1369  error_code);
1370  }
1371 }