OmniSciDB  b24e664e58
RuntimeFunctions.cpp
1 /*
2  * Copyright 2017 MapD Technologies, Inc.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #ifdef __CUDACC__
18 #error This code is not intended to be compiled with a CUDA C++ compiler
19 #endif // __CUDACC__
20 
21 #include "RuntimeFunctions.h"
22 #include "../Shared/funcannotations.h"
23 #include "BufferCompaction.h"
24 #include "HyperLogLogRank.h"
25 #include "MurmurHash.h"
26 #include "TypePunning.h"
27 
28 #include <algorithm>
29 #include <chrono>
30 #include <cmath>
31 #include <cstring>
32 #include <thread>
33 #include <tuple>
34 
35 // decoder implementations
36 
37 #include "DecodersImpl.h"
38 
39 // arithmetic operator implementations
40 
41 #define DEF_ARITH_NULLABLE(type, null_type, opname, opsym) \
42  extern "C" ALWAYS_INLINE type opname##_##type##_nullable( \
43  const type lhs, const type rhs, const null_type null_val) { \
44  if (lhs != null_val && rhs != null_val) { \
45  return lhs opsym rhs; \
46  } \
47  return null_val; \
48  }
49 
50 #define DEF_ARITH_NULLABLE_LHS(type, null_type, opname, opsym) \
51  extern "C" ALWAYS_INLINE type opname##_##type##_nullable_lhs( \
52  const type lhs, const type rhs, const null_type null_val) { \
53  if (lhs != null_val) { \
54  return lhs opsym rhs; \
55  } \
56  return null_val; \
57  }
58 
59 #define DEF_ARITH_NULLABLE_RHS(type, null_type, opname, opsym) \
60  extern "C" ALWAYS_INLINE type opname##_##type##_nullable_rhs( \
61  const type lhs, const type rhs, const null_type null_val) { \
62  if (rhs != null_val) { \
63  return lhs opsym rhs; \
64  } \
65  return null_val; \
66  }
67 
68 #define DEF_CMP_NULLABLE(type, null_type, opname, opsym) \
69  extern "C" ALWAYS_INLINE int8_t opname##_##type##_nullable( \
70  const type lhs, \
71  const type rhs, \
72  const null_type null_val, \
73  const int8_t null_bool_val) { \
74  if (lhs != null_val && rhs != null_val) { \
75  return lhs opsym rhs; \
76  } \
77  return null_bool_val; \
78  }
79 
80 #define DEF_CMP_NULLABLE_LHS(type, null_type, opname, opsym) \
81  extern "C" ALWAYS_INLINE int8_t opname##_##type##_nullable_lhs( \
82  const type lhs, \
83  const type rhs, \
84  const null_type null_val, \
85  const int8_t null_bool_val) { \
86  if (lhs != null_val) { \
87  return lhs opsym rhs; \
88  } \
89  return null_bool_val; \
90  }
91 
92 #define DEF_CMP_NULLABLE_RHS(type, null_type, opname, opsym) \
93  extern "C" ALWAYS_INLINE int8_t opname##_##type##_nullable_rhs( \
94  const type lhs, \
95  const type rhs, \
96  const null_type null_val, \
97  const int8_t null_bool_val) { \
98  if (rhs != null_val) { \
99  return lhs opsym rhs; \
100  } \
101  return null_bool_val; \
102  }
103 
104 #define DEF_SAFE_DIV_NULLABLE(type, null_type, opname) \
105  extern "C" ALWAYS_INLINE type safe_div_##type( \
106  const type lhs, const type rhs, const null_type null_val) { \
107  if (lhs != null_val && rhs != null_val && rhs != 0) { \
108  return lhs / rhs; \
109  } \
110  return null_val; \
111  }
112 
113 #define DEF_BINARY_NULLABLE_ALL_OPS(type, null_type) \
114  DEF_ARITH_NULLABLE(type, null_type, add, +) \
115  DEF_ARITH_NULLABLE(type, null_type, sub, -) \
116  DEF_ARITH_NULLABLE(type, null_type, mul, *) \
117  DEF_ARITH_NULLABLE(type, null_type, div, /) \
118  DEF_SAFE_DIV_NULLABLE(type, null_type, safe_div) \
119  DEF_ARITH_NULLABLE_LHS(type, null_type, add, +) \
120  DEF_ARITH_NULLABLE_LHS(type, null_type, sub, -) \
121  DEF_ARITH_NULLABLE_LHS(type, null_type, mul, *) \
122  DEF_ARITH_NULLABLE_LHS(type, null_type, div, /) \
123  DEF_ARITH_NULLABLE_RHS(type, null_type, add, +) \
124  DEF_ARITH_NULLABLE_RHS(type, null_type, sub, -) \
125  DEF_ARITH_NULLABLE_RHS(type, null_type, mul, *) \
126  DEF_ARITH_NULLABLE_RHS(type, null_type, div, /) \
127  DEF_CMP_NULLABLE(type, null_type, eq, ==) \
128  DEF_CMP_NULLABLE(type, null_type, ne, !=) \
129  DEF_CMP_NULLABLE(type, null_type, lt, <) \
130  DEF_CMP_NULLABLE(type, null_type, gt, >) \
131  DEF_CMP_NULLABLE(type, null_type, le, <=) \
132  DEF_CMP_NULLABLE(type, null_type, ge, >=) \
133  DEF_CMP_NULLABLE_LHS(type, null_type, eq, ==) \
134  DEF_CMP_NULLABLE_LHS(type, null_type, ne, !=) \
135  DEF_CMP_NULLABLE_LHS(type, null_type, lt, <) \
136  DEF_CMP_NULLABLE_LHS(type, null_type, gt, >) \
137  DEF_CMP_NULLABLE_LHS(type, null_type, le, <=) \
138  DEF_CMP_NULLABLE_LHS(type, null_type, ge, >=) \
139  DEF_CMP_NULLABLE_RHS(type, null_type, eq, ==) \
140  DEF_CMP_NULLABLE_RHS(type, null_type, ne, !=) \
141  DEF_CMP_NULLABLE_RHS(type, null_type, lt, <) \
142  DEF_CMP_NULLABLE_RHS(type, null_type, gt, >) \
143  DEF_CMP_NULLABLE_RHS(type, null_type, le, <=) \
144  DEF_CMP_NULLABLE_RHS(type, null_type, ge, >=)
145 
146 DEF_BINARY_NULLABLE_ALL_OPS(int8_t, int64_t)
147 DEF_BINARY_NULLABLE_ALL_OPS(int16_t, int64_t)
148 DEF_BINARY_NULLABLE_ALL_OPS(int32_t, int64_t)
149 DEF_BINARY_NULLABLE_ALL_OPS(int64_t, int64_t)
150 DEF_BINARY_NULLABLE_ALL_OPS(float, float)
151 DEF_BINARY_NULLABLE_ALL_OPS(double, double)
152 DEF_ARITH_NULLABLE(int8_t, int64_t, mod, %)
153 DEF_ARITH_NULLABLE(int16_t, int64_t, mod, %)
154 DEF_ARITH_NULLABLE(int32_t, int64_t, mod, %)
155 DEF_ARITH_NULLABLE(int64_t, int64_t, mod, %)
156 DEF_ARITH_NULLABLE_LHS(int8_t, int64_t, mod, %)
157 DEF_ARITH_NULLABLE_LHS(int16_t, int64_t, mod, %)
158 DEF_ARITH_NULLABLE_LHS(int32_t, int64_t, mod, %)
159 DEF_ARITH_NULLABLE_LHS(int64_t, int64_t, mod, %)
160 DEF_ARITH_NULLABLE_RHS(int8_t, int64_t, mod, %)
161 DEF_ARITH_NULLABLE_RHS(int16_t, int64_t, mod, %)
162 DEF_ARITH_NULLABLE_RHS(int32_t, int64_t, mod, %)
163 DEF_ARITH_NULLABLE_RHS(int64_t, int64_t, mod, %)
164 
165 #undef DEF_BINARY_NULLABLE_ALL_OPS
166 #undef DEF_SAFE_DIV_NULLABLE
167 #undef DEF_CMP_NULLABLE_RHS
168 #undef DEF_CMP_NULLABLE_LHS
169 #undef DEF_CMP_NULLABLE
170 #undef DEF_ARITH_NULLABLE_RHS
171 #undef DEF_ARITH_NULLABLE_LHS
172 #undef DEF_ARITH_NULLABLE
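For reference, a hand-expanded sketch of one instantiation above, DEF_ARITH_NULLABLE(int32_t, int64_t, add, +), whose token pasting yields the symbol add_int32_t_nullable. The _demo suffix and the sentinel value below are illustrative only:

#include <cstdint>
#include <iostream>
#include <limits>

// Hand-expanded form of DEF_ARITH_NULLABLE(int32_t, int64_t, add, +): the sum is
// returned unless either operand equals the null sentinel, which then propagates.
int32_t add_int32_t_nullable_demo(const int32_t lhs,
                                  const int32_t rhs,
                                  const int64_t null_val) {
  if (lhs != null_val && rhs != null_val) {
    return lhs + rhs;
  }
  return null_val;
}

int main() {
  const int64_t null_sentinel = std::numeric_limits<int32_t>::min();  // illustrative
  std::cout << add_int32_t_nullable_demo(7, 3, null_sentinel) << '\n';              // 10
  std::cout << add_int32_t_nullable_demo(7, null_sentinel, null_sentinel) << '\n';  // sentinel
}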
173 
174 extern "C" ALWAYS_INLINE int64_t scale_decimal_up(const int64_t operand,
175  const uint64_t scale,
176  const int64_t operand_null_val,
177  const int64_t result_null_val) {
178  return operand != operand_null_val ? operand * scale : result_null_val;
179 }
180 
181 extern "C" ALWAYS_INLINE int64_t scale_decimal_down_nullable(const int64_t operand,
182  const int64_t scale,
183  const int64_t null_val) {
184  // rounded scale down of a decimal
185  if (operand == null_val) {
186  return null_val;
187  }
188 
189  int64_t tmp = scale >> 1;
190  tmp = operand >= 0 ? operand + tmp : operand - tmp;
191  return tmp / scale;
192 }
193 
194 extern "C" ALWAYS_INLINE int64_t scale_decimal_down_not_nullable(const int64_t operand,
195  const int64_t scale,
196  const int64_t null_val) {
197  int64_t tmp = scale >> 1;
198  tmp = operand >= 0 ? operand + tmp : operand - tmp;
199  return tmp / scale;
200 }
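A small worked example of the scaling helpers above, assuming a DECIMAL stored with two fractional digits (scale 100). rounded_scale_down is a hypothetical local copy of the arithmetic in scale_decimal_down_nullable, shown only to make the rounding visible:

#include <cassert>
#include <cstdint>

// Mirrors scale_decimal_down_nullable: add half the scale before the truncating
// division so the result is rounded to the nearest unit, away from zero on ties.
static int64_t rounded_scale_down(int64_t operand, int64_t scale) {
  const int64_t half = scale >> 1;
  return (operand >= 0 ? operand + half : operand - half) / scale;
}

int main() {
  assert(rounded_scale_down(1250, 100) == 13);    // 12.50 rounds up to 13
  assert(rounded_scale_down(-1250, 100) == -13);  // -12.50 rounds away from zero
  assert(rounded_scale_down(-1249, 100) == -12);  // -12.49 rounds to -12
  assert(1234 * 100 == 123400);                   // scale_decimal_up: 12.34 -> 123400
}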
201 
202 #define DEF_UMINUS_NULLABLE(type, null_type) \
203  extern "C" ALWAYS_INLINE type uminus_##type##_nullable(const type operand, \
204  const null_type null_val) { \
205  return operand == null_val ? null_val : -operand; \
206  }
207 
208 DEF_UMINUS_NULLABLE(int16_t, int16_t)
209 DEF_UMINUS_NULLABLE(int32_t, int32_t)
210 DEF_UMINUS_NULLABLE(int64_t, int64_t)
211 DEF_UMINUS_NULLABLE(float, float)
212 DEF_UMINUS_NULLABLE(double, double)
213 
214 #undef DEF_UMINUS_NULLABLE
215 
216 #define DEF_CAST_NULLABLE(from_type, to_type) \
217  extern "C" ALWAYS_INLINE to_type cast_##from_type##_to_##to_type##_nullable( \
218  const from_type operand, \
219  const from_type from_null_val, \
220  const to_type to_null_val) { \
221  return operand == from_null_val ? to_null_val : operand; \
222  }
223 
224 #define DEF_CAST_NULLABLE_BIDIR(type1, type2) \
225  DEF_CAST_NULLABLE(type1, type2) \
226  DEF_CAST_NULLABLE(type2, type1)
227 
228 DEF_CAST_NULLABLE_BIDIR(int8_t, int16_t)
229 DEF_CAST_NULLABLE_BIDIR(int8_t, int32_t)
230 DEF_CAST_NULLABLE_BIDIR(int8_t, int64_t)
231 DEF_CAST_NULLABLE_BIDIR(int16_t, int32_t)
232 DEF_CAST_NULLABLE_BIDIR(int16_t, int64_t)
233 DEF_CAST_NULLABLE_BIDIR(int32_t, int64_t)
234 DEF_CAST_NULLABLE_BIDIR(float, double)
235 DEF_CAST_NULLABLE_BIDIR(float, int8_t)
236 DEF_CAST_NULLABLE_BIDIR(float, int16_t)
237 DEF_CAST_NULLABLE_BIDIR(float, int32_t)
238 DEF_CAST_NULLABLE_BIDIR(float, int64_t)
239 DEF_CAST_NULLABLE_BIDIR(double, int8_t)
240 DEF_CAST_NULLABLE_BIDIR(double, int16_t)
241 DEF_CAST_NULLABLE_BIDIR(double, int32_t)
242 DEF_CAST_NULLABLE_BIDIR(double, int64_t)
243 DEF_CAST_NULLABLE(uint8_t, int32_t)
244 DEF_CAST_NULLABLE(uint16_t, int32_t)
245 
246 #undef DEF_CAST_NULLABLE_BIDIR
247 #undef DEF_CAST_NULLABLE
248 
249 extern "C" ALWAYS_INLINE int8_t logical_not(const int8_t operand, const int8_t null_val) {
250  return operand == null_val ? operand : (operand ? 0 : 1);
251 }
252 
253 extern "C" ALWAYS_INLINE int8_t logical_and(const int8_t lhs,
254  const int8_t rhs,
255  const int8_t null_val) {
256  if (lhs == null_val) {
257  return rhs == 0 ? rhs : null_val;
258  }
259  if (rhs == null_val) {
260  return lhs == 0 ? lhs : null_val;
261  }
262  return (lhs && rhs) ? 1 : 0;
263 }
264 
265 extern "C" ALWAYS_INLINE int8_t logical_or(const int8_t lhs,
266  const int8_t rhs,
267  const int8_t null_val) {
268  if (lhs == null_val) {
269  return rhs == 0 ? null_val : rhs;
270  }
271  if (rhs == null_val) {
272  return lhs == 0 ? null_val : lhs;
273  }
274  return (lhs || rhs) ? 1 : 0;
275 }
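Together, logical_not, logical_and and logical_or implement SQL's three-valued logic, with the null sentinel standing in for UNKNOWN: a FALSE operand dominates AND, a TRUE operand dominates OR, and NULL propagates otherwise. A minimal sketch (and3/or3 are local mirrors of the functions above; the sentinel is illustrative):

#include <cassert>
#include <cstdint>

int main() {
  const int8_t T = 1, F = 0, N = -128;  // N: illustrative null sentinel

  // Mirrors logical_and: a FALSE operand forces FALSE, otherwise NULL propagates.
  auto and3 = [N](int8_t l, int8_t r) -> int8_t {
    if (l == N) return r == 0 ? r : N;
    if (r == N) return l == 0 ? l : N;
    return (l && r) ? 1 : 0;
  };
  // Mirrors logical_or: a TRUE operand forces TRUE, otherwise NULL propagates.
  auto or3 = [N](int8_t l, int8_t r) -> int8_t {
    if (l == N) return r == 0 ? N : r;
    if (r == N) return l == 0 ? N : l;
    return (l || r) ? 1 : 0;
  };

  assert(and3(N, F) == F && and3(N, T) == N);
  assert(or3(N, T) == T && or3(N, F) == N);
}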
276 
277 // aggregator implementations
278 
279 extern "C" ALWAYS_INLINE uint64_t agg_count(uint64_t* agg, const int64_t) {
280  return (*agg)++;
281 }
282 
283 extern "C" ALWAYS_INLINE void agg_count_distinct_bitmap(int64_t* agg,
284  const int64_t val,
285  const int64_t min_val) {
286  const uint64_t bitmap_idx = val - min_val;
287  reinterpret_cast<int8_t*>(*agg)[bitmap_idx >> 3] |= (1 << (bitmap_idx & 7));
288 }
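agg_count_distinct_bitmap treats *agg as the address of a byte array and sets bit number (val - min_val) within it. A short sketch of that bit addressing; the buffer size and values are illustrative:

#include <cassert>
#include <cstdint>
#include <cstring>

int main() {
  int8_t bitmap[8];                                 // covers 64 distinct values
  std::memset(bitmap, 0, sizeof(bitmap));
  int64_t agg = reinterpret_cast<int64_t>(bitmap);  // the slot stores the buffer address

  const int64_t min_val = 100;
  const int64_t val = 119;                          // bitmap index 19 -> byte 2, bit 3
  const uint64_t bitmap_idx = val - min_val;
  reinterpret_cast<int8_t*>(agg)[bitmap_idx >> 3] |= (1 << (bitmap_idx & 7));

  assert(bitmap[2] == (1 << 3));
}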
289 
290 #define GPU_RT_STUB NEVER_INLINE __attribute__((optnone))
291 
292 extern "C" GPU_RT_STUB void agg_count_distinct_bitmap_gpu(int64_t*,
293  const int64_t,
294  const int64_t,
295  const int64_t,
296  const int64_t,
297  const uint64_t,
298  const uint64_t) {}
299 
300 extern "C" NEVER_INLINE void agg_approximate_count_distinct(int64_t* agg,
301  const int64_t key,
302  const uint32_t b) {
303  const uint64_t hash = MurmurHash64A(&key, sizeof(key), 0);
304  const uint32_t index = hash >> (64 - b);
305  const uint8_t rank = get_rank(hash << b, 64 - b);
306  uint8_t* M = reinterpret_cast<uint8_t*>(*agg);
307  M[index] = std::max(M[index], rank);
308 }
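agg_approximate_count_distinct performs one HyperLogLog update: the top b bits of the 64-bit MurmurHash select a register, and the register keeps the largest rank (computed by get_rank from the remaining bits) seen so far. A rough sketch of the indexing; b, the hash value, and the rank below are stand-ins, not values taken from the engine:

#include <algorithm>
#include <cassert>
#include <cstdint>
#include <vector>

int main() {
  const uint32_t b = 11;                        // 2^11 = 2048 registers (illustrative)
  std::vector<uint8_t> M(1u << b, 0);           // the sketch the agg slot points at

  const uint64_t hash = 0xF0F0F0F0F0F0F0F0ULL;  // stand-in for MurmurHash64A(&key, ...)
  const uint32_t index = hash >> (64 - b);      // top b bits pick a register
  const uint8_t rank = 1;                       // stand-in for get_rank(hash << b, 64 - b)

  M[index] = std::max(M[index], rank);          // keep the largest rank seen
  assert(index < M.size() && M[index] >= 1);
}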
309 
310 extern "C" GPU_RT_STUB void agg_approximate_count_distinct_gpu(int64_t*,
311  const int64_t,
312  const uint32_t,
313  const int64_t,
314  const int64_t) {}
315 
316 extern "C" ALWAYS_INLINE int8_t bit_is_set(const int64_t bitset,
317  const int64_t val,
318  const int64_t min_val,
319  const int64_t max_val,
320  const int64_t null_val,
321  const int8_t null_bool_val) {
322  if (val == null_val) {
323  return null_bool_val;
324  }
325  if (val < min_val || val > max_val) {
326  return 0;
327  }
328  if (!bitset) {
329  return 0;
330  }
331  const uint64_t bitmap_idx = val - min_val;
332  return (reinterpret_cast<const int8_t*>(bitset))[bitmap_idx >> 3] &
333  (1 << (bitmap_idx & 7))
334  ? 1
335  : 0;
336 }
337 
338 extern "C" ALWAYS_INLINE int64_t agg_sum(int64_t* agg, const int64_t val) {
339  const auto old = *agg;
340  *agg += val;
341  return old;
342 }
343 
344 extern "C" ALWAYS_INLINE void agg_max(int64_t* agg, const int64_t val) {
345  *agg = std::max(*agg, val);
346 }
347 
348 extern "C" ALWAYS_INLINE void agg_min(int64_t* agg, const int64_t val) {
349  *agg = std::min(*agg, val);
350 }
351 
352 extern "C" ALWAYS_INLINE void agg_id(int64_t* agg, const int64_t val) {
353  *agg = val;
354 }
355 
356 extern "C" ALWAYS_INLINE void agg_count_distinct_bitmap_skip_val(int64_t* agg,
357  const int64_t val,
358  const int64_t min_val,
359  const int64_t skip_val) {
360  if (val != skip_val) {
361  agg_count_distinct_bitmap(agg, val, min_val);
362  }
363 }
364 
365 extern "C" GPU_RT_STUB void agg_count_distinct_bitmap_skip_val_gpu(int64_t*,
366  const int64_t,
367  const int64_t,
368  const int64_t,
369  const int64_t,
370  const int64_t,
371  const uint64_t,
372  const uint64_t) {}
373 
374 extern "C" ALWAYS_INLINE uint32_t agg_count_int32(uint32_t* agg, const int32_t) {
375  return (*agg)++;
376 }
377 
378 extern "C" ALWAYS_INLINE int32_t agg_sum_int32(int32_t* agg, const int32_t val) {
379  const auto old = *agg;
380  *agg += val;
381  return old;
382 }
383 
384 #define DEF_AGG_MAX_INT(n) \
385  extern "C" ALWAYS_INLINE void agg_max_int##n(int##n##_t* agg, const int##n##_t val) { \
386  *agg = std::max(*agg, val); \
387  }
388 
389 DEF_AGG_MAX_INT(32)
390 DEF_AGG_MAX_INT(16)
391 DEF_AGG_MAX_INT(8)
392 #undef DEF_AGG_MAX_INT
393 
394 #define DEF_AGG_MIN_INT(n) \
395  extern "C" ALWAYS_INLINE void agg_min_int##n(int##n##_t* agg, const int##n##_t val) { \
396  *agg = std::min(*agg, val); \
397  }
398 
399 DEF_AGG_MIN_INT(32)
400 DEF_AGG_MIN_INT(16)
401 DEF_AGG_MIN_INT(8)
402 #undef DEF_AGG_MIN_INT
403 
404 #define DEF_AGG_ID_INT(n) \
405  extern "C" ALWAYS_INLINE void agg_id_int##n(int##n##_t* agg, const int##n##_t val) { \
406  *agg = val; \
407  }
408 
409 DEF_AGG_ID_INT(32)
410 DEF_AGG_ID_INT(16)
411 DEF_AGG_ID_INT(8)
412 #undef DEF_AGG_ID_INT
413 
414 #define DEF_WRITE_PROJECTION_INT(n) \
415  extern "C" ALWAYS_INLINE void write_projection_int##n( \
416  int8_t* slot_ptr, const int##n##_t val, const int64_t init_val) { \
417  if (val != init_val) { \
418  *reinterpret_cast<int##n##_t*>(slot_ptr) = val; \
419  } \
420  }
421 
422 DEF_WRITE_PROJECTION_INT(64)
423 DEF_WRITE_PROJECTION_INT(32)
424 #undef DEF_WRITE_PROJECTION_INT
425 
426 extern "C" ALWAYS_INLINE int64_t agg_sum_skip_val(int64_t* agg,
427  const int64_t val,
428  const int64_t skip_val) {
429  const auto old = *agg;
430  if (val != skip_val) {
431  if (old != skip_val) {
432  return agg_sum(agg, val);
433  } else {
434  *agg = val;
435  }
436  }
437  return old;
438 }
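Note that agg_sum_skip_val uses skip_val both as the NULL marker for incoming values and as the "accumulator not yet initialized" marker, so the first non-NULL value overwrites the slot instead of being added to the sentinel. A minimal sketch with local mirrors of agg_sum/agg_sum_skip_val and an illustrative sentinel:

#include <cassert>
#include <cstdint>

static int64_t sum_step(int64_t* agg, int64_t val) {  // mirrors agg_sum
  const int64_t old = *agg;
  *agg += val;
  return old;
}

static int64_t sum_skip_val(int64_t* agg, int64_t val, int64_t skip_val) {
  const int64_t old = *agg;
  if (val != skip_val) {
    if (old != skip_val) {
      return sum_step(agg, val);
    }
    *agg = val;  // first real value replaces the sentinel
  }
  return old;
}

int main() {
  const int64_t kNull = INT64_MIN;    // illustrative sentinel
  int64_t slot = kNull;               // slot starts out as the sentinel
  sum_skip_val(&slot, 5, kNull);      // 5 replaces the sentinel
  sum_skip_val(&slot, kNull, kNull);  // NULL input is skipped
  sum_skip_val(&slot, 7, kNull);      // accumulates: 5 + 7
  assert(slot == 12);
}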
439 
440 extern "C" ALWAYS_INLINE int32_t agg_sum_int32_skip_val(int32_t* agg,
441  const int32_t val,
442  const int32_t skip_val) {
443  const auto old = *agg;
444  if (val != skip_val) {
445  if (old != skip_val) {
446  return agg_sum_int32(agg, val);
447  } else {
448  *agg = val;
449  }
450  }
451  return old;
452 }
453 
454 extern "C" ALWAYS_INLINE uint64_t agg_count_skip_val(uint64_t* agg,
455  const int64_t val,
456  const int64_t skip_val) {
457  if (val != skip_val) {
458  return agg_count(agg, val);
459  }
460  return *agg;
461 }
462 
463 extern "C" ALWAYS_INLINE uint32_t agg_count_int32_skip_val(uint32_t* agg,
464  const int32_t val,
465  const int32_t skip_val) {
466  if (val != skip_val) {
467  return agg_count_int32(agg, val);
468  }
469  return *agg;
470 }
471 
472 #define DEF_SKIP_AGG_ADD(base_agg_func) \
473  extern "C" ALWAYS_INLINE void base_agg_func##_skip_val( \
474  DATA_T* agg, const DATA_T val, const DATA_T skip_val) { \
475  if (val != skip_val) { \
476  base_agg_func(agg, val); \
477  } \
478  }
479 
480 #define DEF_SKIP_AGG(base_agg_func) \
481  extern "C" ALWAYS_INLINE void base_agg_func##_skip_val( \
482  DATA_T* agg, const DATA_T val, const DATA_T skip_val) { \
483  if (val != skip_val) { \
484  const DATA_T old_agg = *agg; \
485  if (old_agg != skip_val) { \
486  base_agg_func(agg, val); \
487  } else { \
488  *agg = val; \
489  } \
490  } \
491  }
492 
493 #define DATA_T int64_t
494 DEF_SKIP_AGG(agg_max)
495 DEF_SKIP_AGG(agg_min)
496 #undef DATA_T
497 
498 #define DATA_T int32_t
499 DEF_SKIP_AGG(agg_max_int32)
500 DEF_SKIP_AGG(agg_min_int32)
501 #undef DATA_T
502 
503 #define DATA_T int16_t
504 DEF_SKIP_AGG(agg_max_int16)
505 DEF_SKIP_AGG(agg_min_int16)
506 #undef DATA_T
507 
508 #define DATA_T int8_t
509 DEF_SKIP_AGG(agg_max_int8)
510 DEF_SKIP_AGG(agg_min_int8)
511 #undef DATA_T
512 
513 #undef DEF_SKIP_AGG_ADD
514 #undef DEF_SKIP_AGG
515 
516 // TODO(alex): fix signature
517 
518 extern "C" ALWAYS_INLINE uint64_t agg_count_double(uint64_t* agg, const double val) {
519  return (*agg)++;
520 }
521 
522 extern "C" ALWAYS_INLINE void agg_sum_double(int64_t* agg, const double val) {
523  const auto r = *reinterpret_cast<const double*>(agg) + val;
524  *agg = *reinterpret_cast<const int64_t*>(may_alias_ptr(&r));
525 }
526 
527 extern "C" ALWAYS_INLINE void agg_max_double(int64_t* agg, const double val) {
528  const auto r = std::max(*reinterpret_cast<const double*>(agg), val);
529  *agg = *(reinterpret_cast<const int64_t*>(may_alias_ptr(&r)));
530 }
531 
532 extern "C" ALWAYS_INLINE void agg_min_double(int64_t* agg, const double val) {
533  const auto r = std::min(*reinterpret_cast<const double*>(agg), val);
534  *agg = *(reinterpret_cast<const int64_t*>(may_alias_ptr(&r)));
535 }
536 
537 extern "C" ALWAYS_INLINE void agg_id_double(int64_t* agg, const double val) {
538  *agg = *(reinterpret_cast<const int64_t*>(may_alias_ptr(&val)));
539 }
540 
541 extern "C" ALWAYS_INLINE uint32_t agg_count_float(uint32_t* agg, const float val) {
542  return (*agg)++;
543 }
544 
545 extern "C" ALWAYS_INLINE void agg_sum_float(int32_t* agg, const float val) {
546  const auto r = *reinterpret_cast<const float*>(agg) + val;
547  *agg = *reinterpret_cast<const int32_t*>(may_alias_ptr(&r));
548 }
549 
550 extern "C" ALWAYS_INLINE void agg_max_float(int32_t* agg, const float val) {
551  const auto r = std::max(*reinterpret_cast<const float*>(agg), val);
552  *agg = *(reinterpret_cast<const int32_t*>(may_alias_ptr(&r)));
553 }
554 
555 extern "C" ALWAYS_INLINE void agg_min_float(int32_t* agg, const float val) {
556  const auto r = std::min(*reinterpret_cast<const float*>(agg), val);
557  *agg = *(reinterpret_cast<const int32_t*>(may_alias_ptr(&r)));
558 }
559 
560 extern "C" ALWAYS_INLINE void agg_id_float(int32_t* agg, const float val) {
561  *agg = *(reinterpret_cast<const int32_t*>(may_alias_ptr(&val)));
562 }
563 
564 extern "C" ALWAYS_INLINE uint64_t agg_count_double_skip_val(uint64_t* agg,
565  const double val,
566  const double skip_val) {
567  if (val != skip_val) {
568  return agg_count_double(agg, val);
569  }
570  return *agg;
571 }
572 
573 extern "C" ALWAYS_INLINE uint32_t agg_count_float_skip_val(uint32_t* agg,
574  const float val,
575  const float skip_val) {
576  if (val != skip_val) {
577  return agg_count_float(agg, val);
578  }
579  return *agg;
580 }
581 
582 #define DEF_SKIP_AGG_ADD(base_agg_func) \
583  extern "C" ALWAYS_INLINE void base_agg_func##_skip_val( \
584  ADDR_T* agg, const DATA_T val, const DATA_T skip_val) { \
585  if (val != skip_val) { \
586  base_agg_func(agg, val); \
587  } \
588  }
589 
590 #define DEF_SKIP_AGG(base_agg_func) \
591  extern "C" ALWAYS_INLINE void base_agg_func##_skip_val( \
592  ADDR_T* agg, const DATA_T val, const DATA_T skip_val) { \
593  if (val != skip_val) { \
594  const ADDR_T old_agg = *agg; \
595  if (old_agg != *reinterpret_cast<const ADDR_T*>(may_alias_ptr(&skip_val))) { \
596  base_agg_func(agg, val); \
597  } else { \
598  *agg = *reinterpret_cast<const ADDR_T*>(may_alias_ptr(&val)); \
599  } \
600  } \
601  }
602 
603 #define DATA_T double
604 #define ADDR_T int64_t
605 DEF_SKIP_AGG(agg_sum_double)
606 DEF_SKIP_AGG(agg_max_double)
607 DEF_SKIP_AGG(agg_min_double)
608 #undef ADDR_T
609 #undef DATA_T
610 
611 #define DATA_T float
612 #define ADDR_T int32_t
613 DEF_SKIP_AGG(agg_sum_float)
614 DEF_SKIP_AGG(agg_max_float)
615 DEF_SKIP_AGG(agg_min_float)
616 #undef ADDR_T
617 #undef DATA_T
618 
619 #undef DEF_SKIP_AGG_ADD
620 #undef DEF_SKIP_AGG
621 
622 extern "C" ALWAYS_INLINE int64_t decimal_floor(const int64_t x, const int64_t scale) {
623  if (x >= 0) {
624  return x / scale * scale;
625  }
626  if (!(x % scale)) {
627  return x;
628  }
629  return x / scale * scale - scale;
630 }
631 
632 extern "C" ALWAYS_INLINE int64_t decimal_ceil(const int64_t x, const int64_t scale) {
633  return decimal_floor(x, scale) + (x % scale ? scale : 0);
634 }
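A few worked values for decimal_floor/decimal_ceil with a scale of 100 (two fractional digits); unlike plain integer division they round toward negative and positive infinity respectively. dec_floor/dec_ceil are local mirrors used only for the illustration:

#include <cassert>
#include <cstdint>

static int64_t dec_floor(int64_t x, int64_t scale) {  // mirrors decimal_floor
  if (x >= 0) return x / scale * scale;
  if (!(x % scale)) return x;
  return x / scale * scale - scale;
}

static int64_t dec_ceil(int64_t x, int64_t scale) {   // mirrors decimal_ceil
  return dec_floor(x, scale) + (x % scale ? scale : 0);
}

int main() {
  assert(dec_floor(1250, 100) == 1200 && dec_ceil(1250, 100) == 1300);     // 12.50
  assert(dec_floor(-1250, 100) == -1300 && dec_ceil(-1250, 100) == -1200); // -12.50
  assert(dec_floor(1200, 100) == 1200 && dec_ceil(1200, 100) == 1200);     // exact value
}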
635 
636 // Shared memory aggregators. These stubs should never be called;
637 // the real implementations are in cuda_mapd_rt.cu.
638 #define DEF_SHARED_AGG_RET_STUBS(base_agg_func) \
639  extern "C" GPU_RT_STUB uint64_t base_agg_func##_shared(uint64_t* agg, \
640  const int64_t val) { \
641  return 0; \
642  } \
643  \
644  extern "C" GPU_RT_STUB uint64_t base_agg_func##_skip_val_shared( \
645  uint64_t* agg, const int64_t val, const int64_t skip_val) { \
646  return 0; \
647  } \
648  extern "C" GPU_RT_STUB uint32_t base_agg_func##_int32_shared(uint32_t* agg, \
649  const int32_t val) { \
650  return 0; \
651  } \
652  \
653  extern "C" GPU_RT_STUB uint32_t base_agg_func##_int32_skip_val_shared( \
654  uint32_t* agg, const int32_t val, const int32_t skip_val) { \
655  return 0; \
656  } \
657  \
658  extern "C" GPU_RT_STUB uint64_t base_agg_func##_double_shared(uint64_t* agg, \
659  const double val) { \
660  return 0; \
661  } \
662  \
663  extern "C" GPU_RT_STUB uint64_t base_agg_func##_double_skip_val_shared( \
664  uint64_t* agg, const double val, const double skip_val) { \
665  return 0; \
666  } \
667  extern "C" GPU_RT_STUB uint32_t base_agg_func##_float_shared(uint32_t* agg, \
668  const float val) { \
669  return 0; \
670  } \
671  \
672  extern "C" GPU_RT_STUB uint32_t base_agg_func##_float_skip_val_shared( \
673  uint32_t* agg, const float val, const float skip_val) { \
674  return 0; \
675  }
676 
677 #define DEF_SHARED_AGG_STUBS(base_agg_func) \
678  extern "C" GPU_RT_STUB void base_agg_func##_shared(int64_t* agg, const int64_t val) {} \
679  \
680  extern "C" GPU_RT_STUB void base_agg_func##_skip_val_shared( \
681  int64_t* agg, const int64_t val, const int64_t skip_val) {} \
682  extern "C" GPU_RT_STUB void base_agg_func##_int32_shared(int32_t* agg, \
683  const int32_t val) {} \
684  extern "C" GPU_RT_STUB void base_agg_func##_int16_shared(int16_t* agg, \
685  const int16_t val) {} \
686  extern "C" GPU_RT_STUB void base_agg_func##_int8_shared(int8_t* agg, \
687  const int8_t val) {} \
688  \
689  extern "C" GPU_RT_STUB void base_agg_func##_int32_skip_val_shared( \
690  int32_t* agg, const int32_t val, const int32_t skip_val) {} \
691  \
692  extern "C" GPU_RT_STUB void base_agg_func##_double_shared(int64_t* agg, \
693  const double val) {} \
694  \
695  extern "C" GPU_RT_STUB void base_agg_func##_double_skip_val_shared( \
696  int64_t* agg, const double val, const double skip_val) {} \
697  extern "C" GPU_RT_STUB void base_agg_func##_float_shared(int32_t* agg, \
698  const float val) {} \
699  \
700  extern "C" GPU_RT_STUB void base_agg_func##_float_skip_val_shared( \
701  int32_t* agg, const float val, const float skip_val) {}
702 
703 DEF_SHARED_AGG_RET_STUBS(agg_count)
704 DEF_SHARED_AGG_STUBS(agg_max)
705 DEF_SHARED_AGG_STUBS(agg_min)
706 DEF_SHARED_AGG_STUBS(agg_id)
707 
708 extern "C" GPU_RT_STUB void agg_max_int16_skip_val_shared(int16_t* agg,
709  const int16_t val,
710  const int16_t skip_val) {}
711 
712 extern "C" GPU_RT_STUB void agg_max_int8_skip_val_shared(int8_t* agg,
713  const int8_t val,
714  const int8_t skip_val) {}
715 
716 extern "C" GPU_RT_STUB void agg_min_int16_skip_val_shared(int16_t* agg,
717  const int16_t val,
718  const int16_t skip_val) {}
719 
720 extern "C" GPU_RT_STUB void agg_min_int8_skip_val_shared(int8_t* agg,
721  const int8_t val,
722  const int8_t skip_val) {}
723 
724 extern "C" GPU_RT_STUB void agg_id_double_shared_slow(int64_t* agg, const double* val) {}
725 
726 extern "C" GPU_RT_STUB int64_t agg_sum_shared(int64_t* agg, const int64_t val) {
727  return 0;
728 }
729 
730 extern "C" GPU_RT_STUB int64_t agg_sum_skip_val_shared(int64_t* agg,
731  const int64_t val,
732  const int64_t skip_val) {
733  return 0;
734 }
735 extern "C" GPU_RT_STUB int32_t agg_sum_int32_shared(int32_t* agg, const int32_t val) {
736  return 0;
737 }
738 
739 extern "C" GPU_RT_STUB int32_t agg_sum_int32_skip_val_shared(int32_t* agg,
740  const int32_t val,
741  const int32_t skip_val) {
742  return 0;
743 }
744 
745 extern "C" GPU_RT_STUB void agg_sum_double_shared(int64_t* agg, const double val) {}
746 
747 extern "C" GPU_RT_STUB void agg_sum_double_skip_val_shared(int64_t* agg,
748  const double val,
749  const double skip_val) {}
750 extern "C" GPU_RT_STUB void agg_sum_float_shared(int32_t* agg, const float val) {}
751 
752 extern "C" GPU_RT_STUB void agg_sum_float_skip_val_shared(int32_t* agg,
753  const float val,
754  const float skip_val) {}
755 
756 extern "C" GPU_RT_STUB void force_sync() {}
757 
758 extern "C" GPU_RT_STUB void sync_warp() {}
759 extern "C" GPU_RT_STUB void sync_warp_protected(int64_t thread_pos, int64_t row_count) {}
760 
761 // x64 stride functions
762 
763 extern "C" __attribute__((noinline)) int32_t pos_start_impl(int32_t* error_code) {
764  int32_t row_index_resume{0};
765  if (error_code) {
766  row_index_resume = error_code[0];
767  error_code[0] = 0;
768  }
769  return row_index_resume;
770 }
771 
772 extern "C" __attribute__((noinline)) int32_t group_buff_idx_impl() {
773  return pos_start_impl(nullptr);
774 }
775 
776 extern "C" __attribute__((noinline)) int32_t pos_step_impl() {
777  return 1;
778 }
779 
780 extern "C" GPU_RT_STUB int8_t thread_warp_idx(const int8_t warp_sz) {
781  return 0;
782 }
783 
784 #undef GPU_RT_STUB
785 
786 extern "C" ALWAYS_INLINE int32_t record_error_code(const int32_t err_code,
787  int32_t* error_codes) {
788  // NB: never override persistent error codes (with code greater than zero).
789  // On GPU, a projection query with a limit can run out of slots without it
790  // being an actual error if the limit has been hit. If a persistent error
791  // (division by zero, for example) occurs before running out of slots, we
792  // have to avoid overriding it, because there's a risk that the query would
793  // go through if we override with a potentially benign out-of-slots code.
794  if (err_code && error_codes[pos_start_impl(nullptr)] <= 0) {
795  error_codes[pos_start_impl(nullptr)] = err_code;
796  }
797  return err_code;
798 }
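A small sketch of the precedence described in the comment above: once a positive (persistent) code has been recorded for a thread's slot, a later code does not overwrite it. The single-slot layout and the negative "benign" code are illustrative:

#include <cassert>
#include <cstdint>

// Local mirror of record_error_code for a single thread writing slot 0.
static int32_t record(int32_t err_code, int32_t* error_codes) {
  if (err_code && error_codes[0] <= 0) {
    error_codes[0] = err_code;
  }
  return err_code;
}

int main() {
  int32_t error_codes[1] = {0};
  record(7, error_codes);   // persistent error (e.g. division by zero) is stored
  record(-3, error_codes);  // hypothetical benign out-of-slots code does not override
  assert(error_codes[0] == 7);
}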
799 
800 // group by helpers
801 
802 extern "C" __attribute__((noinline)) const int64_t* init_shared_mem_nop(
803  const int64_t* groups_buffer,
804  const int32_t groups_buffer_size) {
805  return groups_buffer;
806 }
807 
808 extern "C" __attribute__((noinline)) void write_back_nop(int64_t* dest,
809  int64_t* src,
810  const int32_t sz) {
811  // the body is not really needed, just make sure the call is not optimized away
812  assert(dest);
813 }
814 
815 extern "C" __attribute__((noinline)) const int64_t* init_shared_mem(
816  const int64_t* groups_buffer,
817  const int32_t groups_buffer_size) {
818  return init_shared_mem_nop(groups_buffer, groups_buffer_size);
819 }
820 
821 extern "C" __attribute__((noinline)) const int64_t* init_shared_mem_dynamic(
822  const int64_t* groups_buffer,
823  const int32_t groups_buffer_size) {
824  return nullptr;
825 }
826 
827 extern "C" __attribute__((noinline)) void write_back(int64_t* dest,
828  int64_t* src,
829  const int32_t sz) {
830  write_back_nop(dest, src, sz);
831 }
832 
833 extern "C" __attribute__((noinline)) void write_back_smem_nop(int64_t* dest,
834  int64_t* src,
835  const int32_t sz) {
836  assert(dest);
837 }
838 
839 extern "C" __attribute__((noinline)) void agg_from_smem_to_gmem_nop(int64_t* dest,
840  int64_t* src,
841  const int32_t sz) {
842  assert(dest);
843 }
844 
845 extern "C" __attribute__((noinline)) void
846 agg_from_smem_to_gmem_count_binId(int64_t* dest, int64_t* src, const int32_t sz) {
847  return agg_from_smem_to_gmem_nop(dest, src, sz);
848 }
849 
850 extern "C" __attribute__((noinline)) void
851 agg_from_smem_to_gmem_binId_count(int64_t* dest, int64_t* src, const int32_t sz) {
852  return agg_from_smem_to_gmem_nop(dest, src, sz);
853 }
854 
855 extern "C" __attribute__((noinline)) void init_group_by_buffer_gpu(
856  int64_t* groups_buffer,
857  const int64_t* init_vals,
858  const uint32_t groups_buffer_entry_count,
859  const uint32_t key_qw_count,
860  const uint32_t agg_col_count,
861  const bool keyless,
862  const int8_t warp_size) {
863  // the body is not really needed, just make sure the call is not optimized away
864  assert(groups_buffer);
865 }
866 
867 extern "C" __attribute__((noinline)) void init_columnar_group_by_buffer_gpu(
868  int64_t* groups_buffer,
869  const int64_t* init_vals,
870  const uint32_t groups_buffer_entry_count,
871  const uint32_t key_qw_count,
872  const uint32_t agg_col_count,
873  const bool keyless,
874  const bool blocks_share_memory,
875  const int32_t frag_idx) {
876  // the body is not really needed, just make sure the call is not optimized away
877  assert(groups_buffer);
878 }
879 
880 extern "C" __attribute__((noinline)) void init_group_by_buffer_impl(
881  int64_t* groups_buffer,
882  const int64_t* init_vals,
883  const uint32_t groups_buffer_entry_count,
884  const uint32_t key_qw_count,
885  const uint32_t agg_col_count,
886  const bool keyless,
887  const int8_t warp_size) {
888  // the body is not really needed, just make sure the call is not optimized away
889  assert(groups_buffer);
890 }
891 
892 template <typename T>
893 ALWAYS_INLINE int64_t* get_matching_group_value(int64_t* groups_buffer,
894  const uint32_t h,
895  const T* key,
896  const uint32_t key_count,
897  const uint32_t row_size_quad) {
898  auto off = h * row_size_quad;
899  auto row_ptr = reinterpret_cast<T*>(groups_buffer + off);
900  if (*row_ptr == get_empty_key<T>()) {
901  memcpy(row_ptr, key, key_count * sizeof(T));
902  auto row_ptr_i8 = reinterpret_cast<int8_t*>(row_ptr + key_count);
903  return reinterpret_cast<int64_t*>(align_to_int64(row_ptr_i8));
904  }
905  if (memcmp(row_ptr, key, key_count * sizeof(T)) == 0) {
906  auto row_ptr_i8 = reinterpret_cast<int8_t*>(row_ptr + key_count);
907  return reinterpret_cast<int64_t*>(align_to_int64(row_ptr_i8));
908  }
909  return nullptr;
910 }
911 
912 extern "C" ALWAYS_INLINE int64_t* get_matching_group_value(int64_t* groups_buffer,
913  const uint32_t h,
914  const int64_t* key,
915  const uint32_t key_count,
916  const uint32_t key_width,
917  const uint32_t row_size_quad,
918  const int64_t* init_vals) {
919  switch (key_width) {
920  case 4:
921  return get_matching_group_value(groups_buffer,
922  h,
923  reinterpret_cast<const int32_t*>(key),
924  key_count,
925  row_size_quad);
926  case 8:
927  return get_matching_group_value(groups_buffer, h, key, key_count, row_size_quad);
928  default:;
929  }
930  return nullptr;
931 }
932 
933 template <typename T>
934 ALWAYS_INLINE int32_t get_matching_group_value_columnar_slot(int64_t* groups_buffer,
935  const uint32_t entry_count,
936  const uint32_t h,
937  const T* key,
938  const uint32_t key_count) {
939  auto off = h;
940  auto key_buffer = reinterpret_cast<T*>(groups_buffer);
941  if (key_buffer[off] == get_empty_key<T>()) {
942  for (size_t i = 0; i < key_count; ++i) {
943  key_buffer[off] = key[i];
944  off += entry_count;
945  }
946  return h;
947  }
948  off = h;
949  for (size_t i = 0; i < key_count; ++i) {
950  if (key_buffer[off] != key[i]) {
951  return -1;
952  }
953  off += entry_count;
954  }
955  return h;
956 }
957 
958 extern "C" ALWAYS_INLINE int32_t
959 get_matching_group_value_columnar_slot(int64_t* groups_buffer,
960  const uint32_t entry_count,
961  const uint32_t h,
962  const int64_t* key,
963  const uint32_t key_count,
964  const uint32_t key_width) {
965  switch (key_width) {
966  case 4:
967  return get_matching_group_value_columnar_slot(groups_buffer,
968  entry_count,
969  h,
970  reinterpret_cast<const int32_t*>(key),
971  key_count);
972  case 8:
973  return get_matching_group_value_columnar_slot(
974  groups_buffer, entry_count, h, key, key_count);
975  default:
976  return -1;
977  }
978  return -1;
979 }
980 
981 extern "C" ALWAYS_INLINE int64_t* get_matching_group_value_columnar(
982  int64_t* groups_buffer,
983  const uint32_t h,
984  const int64_t* key,
985  const uint32_t key_qw_count,
986  const size_t entry_count) {
987  auto off = h;
988  if (groups_buffer[off] == EMPTY_KEY_64) {
989  for (size_t i = 0; i < key_qw_count; ++i) {
990  groups_buffer[off] = key[i];
991  off += entry_count;
992  }
993  return &groups_buffer[off];
994  }
995  off = h;
996  for (size_t i = 0; i < key_qw_count; ++i) {
997  if (groups_buffer[off] != key[i]) {
998  return nullptr;
999  }
1000  off += entry_count;
1001  }
1002  return &groups_buffer[off];
1003 }
1004 
1005 /*
1006  * For a particular hashed_index, returns the row-wise offset
1007  * to the first matching agg column in memory.
1008  * It also checks the corresponding group columns, and initializes all
1009  * the key slots if the entry is still empty (it is assumed all group
1010  * columns are 64-bit wide).
1011  *
1012  * Memory layout:
1013  *
1014  * | prepended group columns (64-bit each) | agg columns |
1015  */
1016 extern "C" ALWAYS_INLINE int64_t* get_matching_group_value_perfect_hash(
1017  int64_t* groups_buffer,
1018  const uint32_t hashed_index,
1019  const int64_t* key,
1020  const uint32_t key_count,
1021  const uint32_t row_size_quad) {
1022  uint32_t off = hashed_index * row_size_quad;
1023  if (groups_buffer[off] == EMPTY_KEY_64) {
1024  for (uint32_t i = 0; i < key_count; ++i) {
1025  groups_buffer[off + i] = key[i];
1026  }
1027  }
1028  return groups_buffer + off + key_count;
1029 }
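A small sketch of the row-wise layout the comment above describes: each entry occupies row_size_quad 64-bit slots, the first key_count of which hold the group key (written on first use), with the aggregate columns following. All sizes and the empty-key stand-in below are illustrative:

#include <cassert>
#include <cstdint>
#include <vector>

int main() {
  const uint32_t row_size_quad = 4;       // 2 key columns + 2 agg columns
  const uint32_t key_count = 2;
  const int64_t kEmptyKey64 = INT64_MAX;  // stand-in for EMPTY_KEY_64

  std::vector<int64_t> groups_buffer(8 * row_size_quad, kEmptyKey64);

  // Mirrors get_matching_group_value_perfect_hash for hashed_index = 3.
  const uint32_t hashed_index = 3;
  const int64_t key[] = {42, 7};
  const uint32_t off = hashed_index * row_size_quad;
  if (groups_buffer[off] == kEmptyKey64) {
    for (uint32_t i = 0; i < key_count; ++i) {
      groups_buffer[off + i] = key[i];
    }
  }
  int64_t* agg_cols = groups_buffer.data() + off + key_count;

  agg_cols[0] = 1;  // first aggregate slot for this group
  assert(groups_buffer[off] == 42 && groups_buffer[off + 1] == 7);
  assert(groups_buffer[off + key_count] == 1);
}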
1030 
1031 /*
1032  * For a particular hashed_index, find and initialize (if necessary) all the group
1033  * columns corresponding to a key. It is assumed that all group columns are 64-bit wide.
1034  */
1035 extern "C" ALWAYS_INLINE void set_matching_group_value_perfect_hash_columnar(
1036  int64_t* groups_buffer,
1037  const uint32_t hashed_index,
1038  const int64_t* key,
1039  const uint32_t key_count,
1040  const uint32_t entry_count) {
1041  if (groups_buffer[hashed_index] == EMPTY_KEY_64) {
1042  for (uint32_t i = 0; i < key_count; i++) {
1043  groups_buffer[i * entry_count + hashed_index] = key[i];
1044  }
1045  }
1046 }
1047 
1048 #include "GroupByRuntime.cpp"
1049 #include "JoinHashTableQueryRuntime.cpp"
1050 
1051 extern "C" ALWAYS_INLINE int64_t* get_group_value_fast_keyless(
1052  int64_t* groups_buffer,
1053  const int64_t key,
1054  const int64_t min_key,
1055  const int64_t /* bucket */,
1056  const uint32_t row_size_quad) {
1057  return groups_buffer + row_size_quad * (key - min_key);
1058 }
1059 
1060 extern "C" ALWAYS_INLINE int64_t* get_group_value_fast_keyless_semiprivate(
1061  int64_t* groups_buffer,
1062  const int64_t key,
1063  const int64_t min_key,
1064  const int64_t /* bucket */,
1065  const uint32_t row_size_quad,
1066  const uint8_t thread_warp_idx,
1067  const uint8_t warp_size) {
1068  return groups_buffer + row_size_quad * (warp_size * (key - min_key) + thread_warp_idx);
1069 }
1070 
1071 extern "C" ALWAYS_INLINE int8_t* extract_str_ptr(const uint64_t str_and_len) {
1072  return reinterpret_cast<int8_t*>(str_and_len & 0xffffffffffff);
1073 }
1074 
1075 extern "C" ALWAYS_INLINE int32_t extract_str_len(const uint64_t str_and_len) {
1076  return static_cast<int64_t>(str_and_len) >> 48;
1077 }
1078 
1079 extern "C" __attribute__((noinline)) int8_t* extract_str_ptr_noinline(
1080  const uint64_t str_and_len) {
1081  return extract_str_ptr(str_and_len);
1082 }
1083 
1084 extern "C" __attribute__((noinline)) int32_t extract_str_len_noinline(
1085  const uint64_t str_and_len) {
1086  return extract_str_len(str_and_len);
1087 }
1088 
1089 extern "C" ALWAYS_INLINE uint64_t string_pack(const int8_t* ptr, const int32_t len) {
1090  return (reinterpret_cast<const uint64_t>(ptr) & 0xffffffffffff) |
1091  (static_cast<const uint64_t>(len) << 48);
1092 }
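string_pack and the extract_str_* helpers keep a string reference in one 64-bit value: the low 48 bits hold the pointer and the high 16 bits the length, which is recovered by an arithmetic shift of the value reinterpreted as signed. A minimal round-trip sketch; it assumes the pointer fits in 48 bits, as it does for typical x86-64 user-space addresses:

#include <cassert>
#include <cstdint>

int main() {
  const char* str = "hello";
  const int32_t len = 5;

  // Pack: low 48 bits = pointer, high 16 bits = length (mirrors string_pack).
  const uint64_t packed = (reinterpret_cast<uint64_t>(str) & 0xffffffffffff) |
                          (static_cast<uint64_t>(len) << 48);

  // Unpack, mirroring extract_str_ptr / extract_str_len.
  const char* ptr_out = reinterpret_cast<const char*>(packed & 0xffffffffffff);
  const int32_t len_out = static_cast<int64_t>(packed) >> 48;

  assert(len_out == 5);
  assert(ptr_out[0] == 'h');
}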
1093 
1094 #ifdef __clang__
1095 #include "../Utils/StringLike.cpp"
1096 #endif
1097 
1098 #ifndef __CUDACC__
1099 #include "TopKRuntime.cpp"
1100 #endif
1101 
1102 extern "C" ALWAYS_INLINE DEVICE int32_t char_length(const char* str,
1103  const int32_t str_len) {
1104  return str_len;
1105 }
1106 
1107 extern "C" ALWAYS_INLINE DEVICE int32_t char_length_nullable(const char* str,
1108  const int32_t str_len,
1109  const int32_t int_null) {
1110  if (!str) {
1111  return int_null;
1112  }
1113  return str_len;
1114 }
1115 
1116 extern "C" ALWAYS_INLINE DEVICE int32_t key_for_string_encoded(const int32_t str_id) {
1117  return str_id;
1118 }
1119 
1120 extern "C" ALWAYS_INLINE int64_t row_number_window_func(const int64_t output_buff,
1121  const int64_t pos) {
1122  return reinterpret_cast<const int64_t*>(output_buff)[pos];
1123 }
1124 
1125 extern "C" ALWAYS_INLINE double percent_window_func(const int64_t output_buff,
1126  const int64_t pos) {
1127  return reinterpret_cast<const double*>(output_buff)[pos];
1128 }
1129 
1130 extern "C" ALWAYS_INLINE double load_double(const int64_t* agg) {
1131  return *reinterpret_cast<const double*>(may_alias_ptr(agg));
1132 }
1133 
1134 extern "C" ALWAYS_INLINE float load_float(const int32_t* agg) {
1135  return *reinterpret_cast<const float*>(may_alias_ptr(agg));
1136 }
1137 
1138 extern "C" ALWAYS_INLINE double load_avg_int(const int64_t* sum,
1139  const int64_t* count,
1140  const double null_val) {
1141  return *count != 0 ? static_cast<double>(*sum) / *count : null_val;
1142 }
1143 
1144 extern "C" ALWAYS_INLINE double load_avg_decimal(const int64_t* sum,
1145  const int64_t* count,
1146  const double null_val,
1147  const uint32_t scale) {
1148  return *count != 0 ? (static_cast<double>(*sum) / pow(10, scale)) / *count : null_val;
1149 }
1150 
1151 extern "C" ALWAYS_INLINE double load_avg_double(const int64_t* agg,
1152  const int64_t* count,
1153  const double null_val) {
1154  return *count != 0 ? *reinterpret_cast<const double*>(may_alias_ptr(agg)) / *count
1155  : null_val;
1156 }
1157 
1158 extern "C" ALWAYS_INLINE double load_avg_float(const int32_t* agg,
1159  const int32_t* count,
1160  const double null_val) {
1161  return *count != 0 ? *reinterpret_cast<const float*>(may_alias_ptr(agg)) / *count
1162  : null_val;
1163 }
1164 
1165 extern "C" NEVER_INLINE void linear_probabilistic_count(uint8_t* bitmap,
1166  const uint32_t bitmap_bytes,
1167  const uint8_t* key_bytes,
1168  const uint32_t key_len) {
1169  const uint32_t bit_pos = MurmurHash1(key_bytes, key_len, 0) % (bitmap_bytes * 8);
1170  const uint32_t word_idx = bit_pos / 32;
1171  const uint32_t bit_idx = bit_pos % 32;
1172  reinterpret_cast<uint32_t*>(bitmap)[word_idx] |= 1 << bit_idx;
1173 }
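linear_probabilistic_count performs one insert into a linear-counting bitmap: the key is hashed, one of bitmap_bytes * 8 bits is set, and the distinct count is later estimated from the fraction of bits that remain zero. A rough usage sketch with an illustrative bitmap size and std::hash standing in for MurmurHash1:

#include <cassert>
#include <cstdint>
#include <functional>
#include <vector>

int main() {
  const uint32_t bitmap_bytes = 1024;  // illustrative sketch size
  std::vector<uint8_t> bitmap(bitmap_bytes, 0);

  const uint64_t key = 12345;
  const uint32_t hash = static_cast<uint32_t>(std::hash<uint64_t>{}(key));

  const uint32_t bit_pos = hash % (bitmap_bytes * 8);
  const uint32_t word_idx = bit_pos / 32;
  const uint32_t bit_idx = bit_pos % 32;
  reinterpret_cast<uint32_t*>(bitmap.data())[word_idx] |= 1u << bit_idx;

  // The estimator later counts zero bits: n ~= -m * ln(zero_bits / m), m = total bits.
  bool any_bit_set = false;
  for (uint8_t byte : bitmap) {
    any_bit_set |= (byte != 0);
  }
  assert(any_bit_set);
}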
1174 
1175 extern "C" __attribute__((noinline)) void query_stub_hoisted_literals(
1176  const int8_t** col_buffers,
1177  const int8_t* literals,
1178  const int64_t* num_rows,
1179  const uint64_t* frag_row_offsets,
1180  const int32_t* max_matched,
1181  const int64_t* init_agg_value,
1182  int64_t** out,
1183  uint32_t frag_idx,
1184  const int64_t* join_hash_tables,
1185  int32_t* error_code,
1186  int32_t* total_matched) {
1187  assert(col_buffers || literals || num_rows || frag_row_offsets || max_matched ||
1188  init_agg_value || out || frag_idx || error_code || join_hash_tables ||
1189  total_matched);
1190 }
1191 
1192 extern "C" void multifrag_query_hoisted_literals(const int8_t*** col_buffers,
1193  const uint64_t* num_fragments,
1194  const int8_t* literals,
1195  const int64_t* num_rows,
1196  const uint64_t* frag_row_offsets,
1197  const int32_t* max_matched,
1198  int32_t* total_matched,
1199  const int64_t* init_agg_value,
1200  int64_t** out,
1201  int32_t* error_code,
1202  const uint32_t* num_tables_ptr,
1203  const int64_t* join_hash_tables) {
1204  for (uint32_t i = 0; i < *num_fragments; ++i) {
1205  query_stub_hoisted_literals(col_buffers ? col_buffers[i] : nullptr,
1206  literals,
1207  &num_rows[i * (*num_tables_ptr)],
1208  &frag_row_offsets[i * (*num_tables_ptr)],
1209  max_matched,
1210  init_agg_value,
1211  out,
1212  i,
1213  join_hash_tables,
1214  total_matched,
1215  error_code);
1216  }
1217 }
1218 
1219 extern "C" __attribute__((noinline)) void query_stub(const int8_t** col_buffers,
1220  const int64_t* num_rows,
1221  const uint64_t* frag_row_offsets,
1222  const int32_t* max_matched,
1223  const int64_t* init_agg_value,
1224  int64_t** out,
1225  uint32_t frag_idx,
1226  const int64_t* join_hash_tables,
1227  int32_t* error_code,
1228  int32_t* total_matched) {
1229  assert(col_buffers || num_rows || frag_row_offsets || max_matched || init_agg_value ||
1230  out || frag_idx || error_code || join_hash_tables || total_matched);
1231 }
1232 
1233 extern "C" void multifrag_query(const int8_t*** col_buffers,
1234  const uint64_t* num_fragments,
1235  const int64_t* num_rows,
1236  const uint64_t* frag_row_offsets,
1237  const int32_t* max_matched,
1238  int32_t* total_matched,
1239  const int64_t* init_agg_value,
1240  int64_t** out,
1241  int32_t* error_code,
1242  const uint32_t* num_tables_ptr,
1243  const int64_t* join_hash_tables) {
1244  for (uint32_t i = 0; i < *num_fragments; ++i) {
1245  query_stub(col_buffers ? col_buffers[i] : nullptr,
1246  &num_rows[i * (*num_tables_ptr)],
1247  &frag_row_offsets[i * (*num_tables_ptr)],
1248  max_matched,
1249  init_agg_value,
1250  out,
1251  i,
1252  join_hash_tables,
1253  total_matched,
1254  error_code);
1255  }
1256 }