OmniSciDB  91042dcc5b
RuntimeFunctions.cpp
1 /*
2  * Copyright 2021 OmniSci, Inc.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #ifdef __CUDACC__
18 #error This code is not intended to be compiled with a CUDA C++ compiler
19 #endif // __CUDACC__
20 
21 #include "RuntimeFunctions.h"
22 #include "../Shared/funcannotations.h"
23 #include "BufferCompaction.h"
24 #include "HyperLogLogRank.h"
25 #include "MurmurHash.h"
26 #include "Shared/quantile.h"
27 #include "TypePunning.h"
28 
29 #include <algorithm>
30 #include <atomic>
31 #include <chrono>
32 #include <cmath>
33 #include <cstdio>
34 #include <cstring>
35 #include <thread>
36 #include <tuple>
37 
38 // decoder implementations
39 
40 #include "DecodersImpl.h"
41 
42 // arithmetic operator implementations
43 
44 #define DEF_ARITH_NULLABLE(type, null_type, opname, opsym) \
45  extern "C" RUNTIME_EXPORT ALWAYS_INLINE type opname##_##type##_nullable( \
46  const type lhs, const type rhs, const null_type null_val) { \
47  if (lhs != null_val && rhs != null_val) { \
48  return lhs opsym rhs; \
49  } \
50  return null_val; \
51  }
52 
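// Added illustrative note (not part of the original source): for the
// instantiation (int32_t, int64_t, add, +) used below, DEF_ARITH_NULLABLE
// expands to roughly the following null-propagating function:
//
//   extern "C" RUNTIME_EXPORT ALWAYS_INLINE int32_t add_int32_t_nullable(
//       const int32_t lhs, const int32_t rhs, const int64_t null_val) {
//     if (lhs != null_val && rhs != null_val) {
//       return lhs + rhs;
//     }
//     return null_val;
//   }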
53 #define DEF_ARITH_NULLABLE_LHS(type, null_type, opname, opsym) \
54  extern "C" RUNTIME_EXPORT ALWAYS_INLINE type opname##_##type##_nullable_lhs( \
55  const type lhs, const type rhs, const null_type null_val) { \
56  if (lhs != null_val) { \
57  return lhs opsym rhs; \
58  } \
59  return null_val; \
60  }
61 
62 #define DEF_ARITH_NULLABLE_RHS(type, null_type, opname, opsym) \
63  extern "C" RUNTIME_EXPORT ALWAYS_INLINE type opname##_##type##_nullable_rhs( \
64  const type lhs, const type rhs, const null_type null_val) { \
65  if (rhs != null_val) { \
66  return lhs opsym rhs; \
67  } \
68  return null_val; \
69  }
70 
71 #define DEF_CMP_NULLABLE(type, null_type, opname, opsym) \
72  extern "C" RUNTIME_EXPORT ALWAYS_INLINE int8_t opname##_##type##_nullable( \
73  const type lhs, \
74  const type rhs, \
75  const null_type null_val, \
76  const int8_t null_bool_val) { \
77  if (lhs != null_val && rhs != null_val) { \
78  return lhs opsym rhs; \
79  } \
80  return null_bool_val; \
81  }
82 
83 #define DEF_CMP_NULLABLE_LHS(type, null_type, opname, opsym) \
84  extern "C" RUNTIME_EXPORT ALWAYS_INLINE int8_t opname##_##type##_nullable_lhs( \
85  const type lhs, \
86  const type rhs, \
87  const null_type null_val, \
88  const int8_t null_bool_val) { \
89  if (lhs != null_val) { \
90  return lhs opsym rhs; \
91  } \
92  return null_bool_val; \
93  }
94 
95 #define DEF_CMP_NULLABLE_RHS(type, null_type, opname, opsym) \
96  extern "C" RUNTIME_EXPORT ALWAYS_INLINE int8_t opname##_##type##_nullable_rhs( \
97  const type lhs, \
98  const type rhs, \
99  const null_type null_val, \
100  const int8_t null_bool_val) { \
101  if (rhs != null_val) { \
102  return lhs opsym rhs; \
103  } \
104  return null_bool_val; \
105  }
106 
107 #define DEF_SAFE_DIV_NULLABLE(type, null_type, opname) \
108  extern "C" RUNTIME_EXPORT ALWAYS_INLINE type safe_div_##type( \
109  const type lhs, const type rhs, const null_type null_val) { \
110  if (lhs != null_val && rhs != null_val && rhs != 0) { \
111  return lhs / rhs; \
112  } \
113  return null_val; \
114  }
115 
116 #define DEF_BINARY_NULLABLE_ALL_OPS(type, null_type) \
117  DEF_ARITH_NULLABLE(type, null_type, add, +) \
118  DEF_ARITH_NULLABLE(type, null_type, sub, -) \
119  DEF_ARITH_NULLABLE(type, null_type, mul, *) \
120  DEF_ARITH_NULLABLE(type, null_type, div, /) \
121  DEF_SAFE_DIV_NULLABLE(type, null_type, safe_div) \
122  DEF_ARITH_NULLABLE_LHS(type, null_type, add, +) \
123  DEF_ARITH_NULLABLE_LHS(type, null_type, sub, -) \
124  DEF_ARITH_NULLABLE_LHS(type, null_type, mul, *) \
125  DEF_ARITH_NULLABLE_LHS(type, null_type, div, /) \
126  DEF_ARITH_NULLABLE_RHS(type, null_type, add, +) \
127  DEF_ARITH_NULLABLE_RHS(type, null_type, sub, -) \
128  DEF_ARITH_NULLABLE_RHS(type, null_type, mul, *) \
129  DEF_ARITH_NULLABLE_RHS(type, null_type, div, /) \
130  DEF_CMP_NULLABLE(type, null_type, eq, ==) \
131  DEF_CMP_NULLABLE(type, null_type, ne, !=) \
132  DEF_CMP_NULLABLE(type, null_type, lt, <) \
133  DEF_CMP_NULLABLE(type, null_type, gt, >) \
134  DEF_CMP_NULLABLE(type, null_type, le, <=) \
135  DEF_CMP_NULLABLE(type, null_type, ge, >=) \
136  DEF_CMP_NULLABLE_LHS(type, null_type, eq, ==) \
137  DEF_CMP_NULLABLE_LHS(type, null_type, ne, !=) \
138  DEF_CMP_NULLABLE_LHS(type, null_type, lt, <) \
139  DEF_CMP_NULLABLE_LHS(type, null_type, gt, >) \
140  DEF_CMP_NULLABLE_LHS(type, null_type, le, <=) \
141  DEF_CMP_NULLABLE_LHS(type, null_type, ge, >=) \
142  DEF_CMP_NULLABLE_RHS(type, null_type, eq, ==) \
143  DEF_CMP_NULLABLE_RHS(type, null_type, ne, !=) \
144  DEF_CMP_NULLABLE_RHS(type, null_type, lt, <) \
145  DEF_CMP_NULLABLE_RHS(type, null_type, gt, >) \
146  DEF_CMP_NULLABLE_RHS(type, null_type, le, <=) \
147  DEF_CMP_NULLABLE_RHS(type, null_type, ge, >=)
148 
149 DEF_BINARY_NULLABLE_ALL_OPS(int8_t, int64_t)
150 DEF_BINARY_NULLABLE_ALL_OPS(int16_t, int64_t)
151 DEF_BINARY_NULLABLE_ALL_OPS(int32_t, int64_t)
152 DEF_BINARY_NULLABLE_ALL_OPS(int64_t, int64_t)
153 DEF_BINARY_NULLABLE_ALL_OPS(float, float)
154 DEF_BINARY_NULLABLE_ALL_OPS(double, double)
155 DEF_ARITH_NULLABLE(int8_t, int64_t, mod, %)
156 DEF_ARITH_NULLABLE(int16_t, int64_t, mod, %)
157 DEF_ARITH_NULLABLE(int32_t, int64_t, mod, %)
158 DEF_ARITH_NULLABLE(int64_t, int64_t, mod, %)
159 DEF_ARITH_NULLABLE_LHS(int8_t, int64_t, mod, %)
160 DEF_ARITH_NULLABLE_LHS(int16_t, int64_t, mod, %)
161 DEF_ARITH_NULLABLE_LHS(int32_t, int64_t, mod, %)
162 DEF_ARITH_NULLABLE_LHS(int64_t, int64_t, mod, %)
163 DEF_ARITH_NULLABLE_RHS(int8_t, int64_t, mod, %)
164 DEF_ARITH_NULLABLE_RHS(int16_t, int64_t, mod, %)
165 DEF_ARITH_NULLABLE_RHS(int32_t, int64_t, mod, %)
166 DEF_ARITH_NULLABLE_RHS(int64_t, int64_t, mod, %)
167 
168 #undef DEF_BINARY_NULLABLE_ALL_OPS
169 #undef DEF_SAFE_DIV_NULLABLE
170 #undef DEF_CMP_NULLABLE_RHS
171 #undef DEF_CMP_NULLABLE_LHS
172 #undef DEF_CMP_NULLABLE
173 #undef DEF_ARITH_NULLABLE_RHS
174 #undef DEF_ARITH_NULLABLE_LHS
175 #undef DEF_ARITH_NULLABLE
176 
177 extern "C" RUNTIME_EXPORT ALWAYS_INLINE int64_t
178 scale_decimal_up(const int64_t operand,
179  const uint64_t scale,
180  const int64_t operand_null_val,
181  const int64_t result_null_val) {
182  return operand != operand_null_val ? operand * scale : result_null_val;
183 }
184 
185 extern "C" RUNTIME_EXPORT ALWAYS_INLINE int64_t
186 scale_decimal_down_nullable(const int64_t operand,
187  const int64_t scale,
188  const int64_t null_val) {
189  // rounded scale down of a decimal
190  if (operand == null_val) {
191  return null_val;
192  }
193 
194  int64_t tmp = scale >> 1;
195  tmp = operand >= 0 ? operand + tmp : operand - tmp;
196  return tmp / scale;
197 }
198 
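// Added illustrative note: scale_decimal_down_nullable is a rounded integer
// division by the scale. With scale = 100 (two decimal digits):
//   scale_decimal_down_nullable(1250, 100, null)  -> (1250 + 50) / 100  ==  13
//   scale_decimal_down_nullable(-1250, 100, null) -> (-1250 - 50) / 100 == -13
//   scale_decimal_down_nullable(1234, 100, null)  -> (1234 + 50) / 100  ==  12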
199 extern "C" RUNTIME_EXPORT ALWAYS_INLINE int64_t
200 scale_decimal_down_not_nullable(const int64_t operand,
201  const int64_t scale,
202  const int64_t null_val) {
203  int64_t tmp = scale >> 1;
204  tmp = operand >= 0 ? operand + tmp : operand - tmp;
205  return tmp / scale;
206 }
207 
208 // Return floor(dividend / divisor).
209 // Assumes 0 < divisor.
210 extern "C" RUNTIME_EXPORT ALWAYS_INLINE int64_t floor_div_lhs(const int64_t dividend,
211  const int64_t divisor) {
212  return (dividend < 0 ? dividend - (divisor - 1) : dividend) / divisor;
213 }
214 
215 // Return floor(dividend / divisor) or NULL if dividend IS NULL.
216 // Assumes 0 < divisor.
217 extern "C" RUNTIME_EXPORT ALWAYS_INLINE int64_t
218 floor_div_nullable_lhs(const int64_t dividend,
219  const int64_t divisor,
220  const int64_t null_val) {
221  return dividend == null_val ? null_val : floor_div_lhs(dividend, divisor);
222 }
223 
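// Added illustrative note: unlike C++ division, which truncates toward zero,
// floor_div_lhs rounds toward negative infinity:
//   -7 / 2               == -3  (truncation)
//   floor_div_lhs(-7, 2) == -4  (floor), since (-7 - (2 - 1)) / 2 == -8 / 2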
224 #define DEF_UMINUS_NULLABLE(type, null_type) \
225  extern "C" RUNTIME_EXPORT ALWAYS_INLINE type uminus_##type##_nullable( \
226  const type operand, const null_type null_val) { \
227  return operand == null_val ? null_val : -operand; \
228  }
229 
230 DEF_UMINUS_NULLABLE(int8_t, int8_t)
231 DEF_UMINUS_NULLABLE(int16_t, int16_t)
232 DEF_UMINUS_NULLABLE(int32_t, int32_t)
233 DEF_UMINUS_NULLABLE(int64_t, int64_t)
234 DEF_UMINUS_NULLABLE(float, float)
235 DEF_UMINUS_NULLABLE(double, double)
236 
237 #undef DEF_UMINUS_NULLABLE
238 
239 #define DEF_CAST_NULLABLE(from_type, to_type) \
240  extern "C" RUNTIME_EXPORT ALWAYS_INLINE to_type \
241  cast_##from_type##_to_##to_type##_nullable(const from_type operand, \
242  const from_type from_null_val, \
243  const to_type to_null_val) { \
244  return operand == from_null_val ? to_null_val : operand; \
245  }
246 
247 #define DEF_CAST_SCALED_NULLABLE(from_type, to_type) \
248  extern "C" RUNTIME_EXPORT ALWAYS_INLINE to_type \
249  cast_##from_type##_to_##to_type##_scaled_nullable(const from_type operand, \
250  const from_type from_null_val, \
251  const to_type to_null_val, \
252  const to_type multiplier) { \
253  return operand == from_null_val ? to_null_val : multiplier * operand; \
254  }
255 
256 #define DEF_CAST_NULLABLE_BIDIR(type1, type2) \
257  DEF_CAST_NULLABLE(type1, type2) \
258  DEF_CAST_NULLABLE(type2, type1)
259 
260 #define DEF_ROUND_NULLABLE(from_type, to_type) \
261  extern "C" RUNTIME_EXPORT ALWAYS_INLINE to_type \
262  cast_##from_type##_to_##to_type##_nullable(const from_type operand, \
263  const from_type from_null_val, \
264  const to_type to_null_val) { \
265  return operand == from_null_val \
266  ? to_null_val \
267  : static_cast<to_type>(operand + (operand < from_type(0) \
268  ? from_type(-0.5) \
269  : from_type(0.5))); \
270  }
271 
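// Added illustrative note: DEF_ROUND_NULLABLE rounds half away from zero
// before casting, e.g. for cast_double_to_int32_nullable:
//   2.5  -> static_cast<int32_t>(2.5 + 0.5)  ==  3
//   -2.5 -> static_cast<int32_t>(-2.5 - 0.5) == -3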
272 DEF_CAST_NULLABLE_BIDIR(int8_t, int16_t)
273 DEF_CAST_NULLABLE_BIDIR(int8_t, int32_t)
274 DEF_CAST_NULLABLE_BIDIR(int8_t, int64_t)
275 DEF_CAST_NULLABLE_BIDIR(int16_t, int32_t)
276 DEF_CAST_NULLABLE_BIDIR(int16_t, int64_t)
277 DEF_CAST_NULLABLE_BIDIR(int32_t, int64_t)
278 DEF_CAST_NULLABLE_BIDIR(float, double)
279 
280 DEF_CAST_NULLABLE(int8_t, float)
281 DEF_CAST_NULLABLE(int16_t, float)
282 DEF_CAST_NULLABLE(int32_t, float)
283 DEF_CAST_NULLABLE(int64_t, float)
284 DEF_CAST_NULLABLE(int8_t, double)
285 DEF_CAST_NULLABLE(int16_t, double)
286 DEF_CAST_NULLABLE(int32_t, double)
287 DEF_CAST_NULLABLE(int64_t, double)
288 
289 DEF_ROUND_NULLABLE(float, int8_t)
290 DEF_ROUND_NULLABLE(float, int16_t)
291 DEF_ROUND_NULLABLE(float, int32_t)
292 DEF_ROUND_NULLABLE(float, int64_t)
293 DEF_ROUND_NULLABLE(double, int8_t)
294 DEF_ROUND_NULLABLE(double, int16_t)
295 DEF_ROUND_NULLABLE(double, int32_t)
296 DEF_ROUND_NULLABLE(double, int64_t)
297 
298 DEF_CAST_NULLABLE(uint8_t, int32_t)
299 DEF_CAST_NULLABLE(uint16_t, int32_t)
300 DEF_CAST_SCALED_NULLABLE(int64_t, float)
301 DEF_CAST_SCALED_NULLABLE(int64_t, double)
302 
303 #undef DEF_ROUND_NULLABLE
304 #undef DEF_CAST_NULLABLE_BIDIR
305 #undef DEF_CAST_SCALED_NULLABLE
306 #undef DEF_CAST_NULLABLE
307 
308 extern "C" RUNTIME_EXPORT ALWAYS_INLINE int8_t logical_not(const int8_t operand,
309  const int8_t null_val) {
310  return operand == null_val ? operand : (operand ? 0 : 1);
311 }
312 
313 extern "C" RUNTIME_EXPORT ALWAYS_INLINE int8_t logical_and(const int8_t lhs,
314  const int8_t rhs,
315  const int8_t null_val) {
316  if (lhs == null_val) {
317  return rhs == 0 ? rhs : null_val;
318  }
319  if (rhs == null_val) {
320  return lhs == 0 ? lhs : null_val;
321  }
322  return (lhs && rhs) ? 1 : 0;
323 }
324 
325 extern "C" RUNTIME_EXPORT ALWAYS_INLINE int8_t logical_or(const int8_t lhs,
326  const int8_t rhs,
327  const int8_t null_val) {
328  if (lhs == null_val) {
329  return rhs == 0 ? null_val : rhs;
330  }
331  if (rhs == null_val) {
332  return lhs == 0 ? null_val : lhs;
333  }
334  return (lhs || rhs) ? 1 : 0;
335 }
336 
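// Added illustrative note: logical_and / logical_or implement SQL
// three-valued logic, with null_val playing the role of UNKNOWN:
//   AND: FALSE AND UNKNOWN -> FALSE,   TRUE AND UNKNOWN  -> UNKNOWN
//   OR:  TRUE  OR  UNKNOWN -> TRUE,    FALSE OR  UNKNOWN -> UNKNOWN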
337 // aggregator implementations
338 
339 extern "C" RUNTIME_EXPORT ALWAYS_INLINE uint64_t agg_count(uint64_t* agg, const int64_t) {
340  return (*agg)++;
341 }
342 
343 extern "C" RUNTIME_EXPORT ALWAYS_INLINE void
344 agg_count_distinct_bitmap(int64_t* agg, const int64_t val, const int64_t min_val) {
345  const uint64_t bitmap_idx = val - min_val;
346  reinterpret_cast<int8_t*>(*agg)[bitmap_idx >> 3] |= (1 << (bitmap_idx & 7));
347 }
348 
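// Added illustrative note: *agg holds a pointer to a byte-addressable bitmap;
// agg_count_distinct_bitmap sets bit (val - min_val), so every distinct value
// in the domain maps to exactly one bit. bit_is_set below reads the same
// layout back.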
349 #ifdef _MSC_VER
350 #define GPU_RT_STUB NEVER_INLINE
351 #else
352 #define GPU_RT_STUB NEVER_INLINE __attribute__((optnone))
353 #endif
354 
355 extern "C" GPU_RT_STUB void agg_count_distinct_bitmap_gpu(int64_t*,
 356  const int64_t,
357  const int64_t,
358  const int64_t,
359  const int64_t,
360  const uint64_t,
361  const uint64_t) {}
362 
363 extern "C" RUNTIME_EXPORT NEVER_INLINE void
364 agg_approximate_count_distinct(int64_t* agg, const int64_t key, const uint32_t b) {
365  const uint64_t hash = MurmurHash64A(&key, sizeof(key), 0);
366  const uint32_t index = hash >> (64 - b);
367  const uint8_t rank = get_rank(hash << b, 64 - b);
368  uint8_t* M = reinterpret_cast<uint8_t*>(*agg);
369  M[index] = std::max(M[index], rank);
370 }
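// Added illustrative note: this is the usual HyperLogLog register update. The
// top b bits of the 64-bit MurmurHash select one of 2^b registers, and the
// rank (position of the leftmost set bit) of the remaining bits is kept as a
// running maximum in that register.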
371 
372 extern "C" GPU_RT_STUB void agg_approximate_count_distinct_gpu(int64_t*,
 373  const int64_t,
374  const uint32_t,
375  const int64_t,
376  const int64_t) {}
377 
378 extern "C" RUNTIME_EXPORT ALWAYS_INLINE int8_t bit_is_set(const int64_t bitset,
379  const int64_t val,
380  const int64_t min_val,
381  const int64_t max_val,
382  const int64_t null_val,
383  const int8_t null_bool_val) {
384  if (val == null_val) {
385  return null_bool_val;
386  }
387  if (val < min_val || val > max_val) {
388  return 0;
389  }
390  if (!bitset) {
391  return 0;
392  }
393  const uint64_t bitmap_idx = val - min_val;
394  return (reinterpret_cast<const int8_t*>(bitset))[bitmap_idx >> 3] &
395  (1 << (bitmap_idx & 7))
396  ? 1
397  : 0;
398 }
399 
400 extern "C" RUNTIME_EXPORT ALWAYS_INLINE int64_t agg_sum(int64_t* agg, const int64_t val) {
401  const auto old = *agg;
402  *agg += val;
403  return old;
404 }
405 
406 extern "C" RUNTIME_EXPORT ALWAYS_INLINE void agg_max(int64_t* agg, const int64_t val) {
407  *agg = std::max(*agg, val);
408 }
409 
410 extern "C" RUNTIME_EXPORT ALWAYS_INLINE void agg_min(int64_t* agg, const int64_t val) {
411  *agg = std::min(*agg, val);
412 }
413 
414 extern "C" RUNTIME_EXPORT ALWAYS_INLINE void agg_id(int64_t* agg, const int64_t val) {
415  *agg = val;
416 }
417 
418 extern "C" RUNTIME_EXPORT ALWAYS_INLINE int8_t* agg_id_varlen(int8_t* varlen_buffer,
419  const int64_t offset,
420  const int8_t* value,
421  const int64_t size_bytes) {
422  for (auto i = 0; i < size_bytes; i++) {
423  varlen_buffer[offset + i] = value[i];
424  }
425  return &varlen_buffer[offset];
426 }
427 
428 extern "C" RUNTIME_EXPORT ALWAYS_INLINE int32_t
429 checked_single_agg_id(int64_t* agg, const int64_t val, const int64_t null_val) {
430  if (val == null_val) {
431  return 0;
432  }
433 
434  if (*agg == val) {
435  return 0;
436  } else if (*agg == null_val) {
437  *agg = val;
438  return 0;
439  } else {
440  // see Execute::ERR_SINGLE_VALUE_FOUND_MULTIPLE_VALUES
441  return 15;
442  }
443 }
444 
445 extern "C" RUNTIME_EXPORT ALWAYS_INLINE void agg_count_distinct_bitmap_skip_val(
 446  int64_t* agg,
447  const int64_t val,
448  const int64_t min_val,
449  const int64_t skip_val) {
450  if (val != skip_val) {
451  agg_count_distinct_bitmap(agg, val, min_val);
452  }
453 }
454 
455 extern "C" GPU_RT_STUB void agg_count_distinct_bitmap_skip_val_gpu(int64_t*,
 456  const int64_t,
457  const int64_t,
458  const int64_t,
459  const int64_t,
460  const int64_t,
461  const uint64_t,
462  const uint64_t) {}
463 
464 extern "C" RUNTIME_EXPORT ALWAYS_INLINE uint32_t agg_count_int32(uint32_t* agg,
465  const int32_t) {
466  return (*agg)++;
467 }
468 
469 extern "C" RUNTIME_EXPORT ALWAYS_INLINE int32_t agg_sum_int32(int32_t* agg,
470  const int32_t val) {
471  const auto old = *agg;
472  *agg += val;
473  return old;
474 }
475 
476 #define DEF_AGG_MAX_INT(n) \
477  extern "C" RUNTIME_EXPORT ALWAYS_INLINE void agg_max_int##n(int##n##_t* agg, \
478  const int##n##_t val) { \
479  *agg = std::max(*agg, val); \
480  }
481 
482 DEF_AGG_MAX_INT(32)
483 DEF_AGG_MAX_INT(16)
484 DEF_AGG_MAX_INT(8)
485 #undef DEF_AGG_MAX_INT
486 
487 #define DEF_AGG_MIN_INT(n) \
488  extern "C" RUNTIME_EXPORT ALWAYS_INLINE void agg_min_int##n(int##n##_t* agg, \
489  const int##n##_t val) { \
490  *agg = std::min(*agg, val); \
491  }
492 
493 DEF_AGG_MIN_INT(32)
494 DEF_AGG_MIN_INT(16)
495 DEF_AGG_MIN_INT(8)
496 #undef DEF_AGG_MIN_INT
497 
498 #define DEF_AGG_ID_INT(n) \
499  extern "C" RUNTIME_EXPORT ALWAYS_INLINE void agg_id_int##n(int##n##_t* agg, \
500  const int##n##_t val) { \
501  *agg = val; \
502  }
503 
504 #define DEF_CHECKED_SINGLE_AGG_ID_INT(n) \
505  extern "C" RUNTIME_EXPORT ALWAYS_INLINE int32_t checked_single_agg_id_int##n( \
506  int##n##_t* agg, const int##n##_t val, const int##n##_t null_val) { \
507  if (val == null_val) { \
508  return 0; \
509  } \
510  if (*agg == val) { \
511  return 0; \
512  } else if (*agg == null_val) { \
513  *agg = val; \
514  return 0; \
515  } else { \
516  /* see Execute::ERR_SINGLE_VALUE_FOUND_MULTIPLE_VALUES*/ \
517  return 15; \
518  } \
519  }
520 
521 DEF_AGG_ID_INT(32)
522 DEF_AGG_ID_INT(16)
523 DEF_AGG_ID_INT(8)
524 
525 DEF_CHECKED_SINGLE_AGG_ID_INT(32)
526 DEF_CHECKED_SINGLE_AGG_ID_INT(16)
527 DEF_CHECKED_SINGLE_AGG_ID_INT(8)
528 
529 #undef DEF_AGG_ID_INT
530 #undef DEF_CHECKED_SINGLE_AGG_ID_INT
531 
532 #define DEF_WRITE_PROJECTION_INT(n) \
533  extern "C" RUNTIME_EXPORT ALWAYS_INLINE void write_projection_int##n( \
534  int8_t* slot_ptr, const int##n##_t val, const int64_t init_val) { \
535  if (val != init_val) { \
536  *reinterpret_cast<int##n##_t*>(slot_ptr) = val; \
537  } \
538  }
539 
540 DEF_WRITE_PROJECTION_INT(64)
541 DEF_WRITE_PROJECTION_INT(32)
542 #undef DEF_WRITE_PROJECTION_INT
543 
544 extern "C" RUNTIME_EXPORT ALWAYS_INLINE int64_t agg_sum_skip_val(int64_t* agg,
545  const int64_t val,
546  const int64_t skip_val) {
547  const auto old = *agg;
548  if (val != skip_val) {
549  if (old != skip_val) {
550  return agg_sum(agg, val);
551  } else {
552  *agg = val;
553  }
554  }
555  return old;
556 }
557 
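// Added illustrative note: in the *_skip_val variants, skip_val is both the
// column's NULL sentinel and the aggregate's initial value, so NULL inputs
// are ignored and the first non-NULL value overwrites the sentinel instead
// of being combined with it.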
558 extern "C" RUNTIME_EXPORT ALWAYS_INLINE int32_t
559 agg_sum_int32_skip_val(int32_t* agg, const int32_t val, const int32_t skip_val) {
560  const auto old = *agg;
561  if (val != skip_val) {
562  if (old != skip_val) {
563  return agg_sum_int32(agg, val);
564  } else {
565  *agg = val;
566  }
567  }
568  return old;
569 }
570 
571 extern "C" RUNTIME_EXPORT ALWAYS_INLINE uint64_t
572 agg_count_skip_val(uint64_t* agg, const int64_t val, const int64_t skip_val) {
573  if (val != skip_val) {
574  return agg_count(agg, val);
575  }
576  return *agg;
577 }
578 
579 extern "C" RUNTIME_EXPORT ALWAYS_INLINE uint32_t
580 agg_count_int32_skip_val(uint32_t* agg, const int32_t val, const int32_t skip_val) {
581  if (val != skip_val) {
582  return agg_count_int32(agg, val);
583  }
584  return *agg;
585 }
586 
587 #define DEF_SKIP_AGG_ADD(base_agg_func) \
588  extern "C" RUNTIME_EXPORT ALWAYS_INLINE void base_agg_func##_skip_val( \
589  DATA_T* agg, const DATA_T val, const DATA_T skip_val) { \
590  if (val != skip_val) { \
591  base_agg_func(agg, val); \
592  } \
593  }
594 
595 #define DEF_SKIP_AGG(base_agg_func) \
596  extern "C" RUNTIME_EXPORT ALWAYS_INLINE void base_agg_func##_skip_val( \
597  DATA_T* agg, const DATA_T val, const DATA_T skip_val) { \
598  if (val != skip_val) { \
599  const DATA_T old_agg = *agg; \
600  if (old_agg != skip_val) { \
601  base_agg_func(agg, val); \
602  } else { \
603  *agg = val; \
604  } \
605  } \
606  }
607 
608 #define DATA_T int64_t
609 DEF_SKIP_AGG(agg_max)
610 DEF_SKIP_AGG(agg_min)
611 #undef DATA_T
612 
613 #define DATA_T int32_t
614 DEF_SKIP_AGG(agg_max_int32)
615 DEF_SKIP_AGG(agg_min_int32)
616 #undef DATA_T
617 
618 #define DATA_T int16_t
619 DEF_SKIP_AGG(agg_max_int16)
620 DEF_SKIP_AGG(agg_min_int16)
621 #undef DATA_T
622 
623 #define DATA_T int8_t
624 DEF_SKIP_AGG(agg_max_int8)
625 DEF_SKIP_AGG(agg_min_int8)
626 #undef DATA_T
627 
628 #undef DEF_SKIP_AGG_ADD
629 #undef DEF_SKIP_AGG
630 
631 // TODO(alex): fix signature
632 
633 extern "C" RUNTIME_EXPORT ALWAYS_INLINE uint64_t agg_count_double(uint64_t* agg,
634  const double val) {
635  return (*agg)++;
636 }
637 
638 extern "C" RUNTIME_EXPORT ALWAYS_INLINE void agg_sum_double(int64_t* agg,
639  const double val) {
640  const auto r = *reinterpret_cast<const double*>(agg) + val;
641  *agg = *reinterpret_cast<const int64_t*>(may_alias_ptr(&r));
642 }
643 
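// Added illustrative note: floating point aggregates live in integer slots.
// The current slot contents are reinterpreted as a double, updated, and the
// resulting bit pattern is written back through may_alias_ptr() to sidestep
// strict-aliasing issues.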
644 extern "C" RUNTIME_EXPORT ALWAYS_INLINE void agg_max_double(int64_t* agg,
645  const double val) {
646  const auto r = std::max(*reinterpret_cast<const double*>(agg), val);
647  *agg = *(reinterpret_cast<const int64_t*>(may_alias_ptr(&r)));
648 }
649 
650 extern "C" RUNTIME_EXPORT ALWAYS_INLINE void agg_min_double(int64_t* agg,
651  const double val) {
652  const auto r = std::min(*reinterpret_cast<const double*>(agg), val);
653  *agg = *(reinterpret_cast<const int64_t*>(may_alias_ptr(&r)));
654 }
655 
656 extern "C" RUNTIME_EXPORT ALWAYS_INLINE void agg_id_double(int64_t* agg,
657  const double val) {
658  *agg = *(reinterpret_cast<const int64_t*>(may_alias_ptr(&val)));
659 }
660 
661 extern "C" RUNTIME_EXPORT ALWAYS_INLINE int32_t
662 checked_single_agg_id_double(int64_t* agg, const double val, const double null_val) {
663  if (val == null_val) {
664  return 0;
665  }
666 
667  if (*agg == *(reinterpret_cast<const int64_t*>(may_alias_ptr(&val)))) {
668  return 0;
669  } else if (*agg == *(reinterpret_cast<const int64_t*>(may_alias_ptr(&null_val)))) {
670  *agg = *(reinterpret_cast<const int64_t*>(may_alias_ptr(&val)));
671  return 0;
672  } else {
673  // see Execute::ERR_SINGLE_VALUE_FOUND_MULTIPLE_VALUES
674  return 15;
675  }
676 }
677 
678 extern "C" RUNTIME_EXPORT ALWAYS_INLINE uint32_t agg_count_float(uint32_t* agg,
679  const float val) {
680  return (*agg)++;
681 }
682 
683 extern "C" RUNTIME_EXPORT ALWAYS_INLINE void agg_sum_float(int32_t* agg,
684  const float val) {
685  const auto r = *reinterpret_cast<const float*>(agg) + val;
686  *agg = *reinterpret_cast<const int32_t*>(may_alias_ptr(&r));
687 }
688 
689 extern "C" RUNTIME_EXPORT ALWAYS_INLINE void agg_max_float(int32_t* agg,
690  const float val) {
691  const auto r = std::max(*reinterpret_cast<const float*>(agg), val);
692  *agg = *(reinterpret_cast<const int32_t*>(may_alias_ptr(&r)));
693 }
694 
695 extern "C" RUNTIME_EXPORT ALWAYS_INLINE void agg_min_float(int32_t* agg,
696  const float val) {
697  const auto r = std::min(*reinterpret_cast<const float*>(agg), val);
698  *agg = *(reinterpret_cast<const int32_t*>(may_alias_ptr(&r)));
699 }
700 
701 extern "C" RUNTIME_EXPORT ALWAYS_INLINE void agg_id_float(int32_t* agg, const float val) {
702  *agg = *(reinterpret_cast<const int32_t*>(may_alias_ptr(&val)));
703 }
704 
705 extern "C" RUNTIME_EXPORT ALWAYS_INLINE int32_t
706 checked_single_agg_id_float(int32_t* agg, const float val, const float null_val) {
707  if (val == null_val) {
708  return 0;
709  }
710 
711  if (*agg == *(reinterpret_cast<const int32_t*>(may_alias_ptr(&val)))) {
712  return 0;
713  } else if (*agg == *(reinterpret_cast<const int32_t*>(may_alias_ptr(&null_val)))) {
714  *agg = *(reinterpret_cast<const int32_t*>(may_alias_ptr(&val)));
715  return 0;
716  } else {
717  // see Execute::ERR_SINGLE_VALUE_FOUND_MULTIPLE_VALUES
718  return 15;
719  }
720 }
721 
722 extern "C" RUNTIME_EXPORT ALWAYS_INLINE uint64_t
723 agg_count_double_skip_val(uint64_t* agg, const double val, const double skip_val) {
724  if (val != skip_val) {
725  return agg_count_double(agg, val);
726  }
727  return *agg;
728 }
729 
730 extern "C" RUNTIME_EXPORT ALWAYS_INLINE uint32_t
731 agg_count_float_skip_val(uint32_t* agg, const float val, const float skip_val) {
732  if (val != skip_val) {
733  return agg_count_float(agg, val);
734  }
735  return *agg;
736 }
737 
738 #define DEF_SKIP_AGG_ADD(base_agg_func) \
739  extern "C" RUNTIME_EXPORT ALWAYS_INLINE void base_agg_func##_skip_val( \
740  ADDR_T* agg, const DATA_T val, const DATA_T skip_val) { \
741  if (val != skip_val) { \
742  base_agg_func(agg, val); \
743  } \
744  }
745 
746 #define DEF_SKIP_AGG(base_agg_func) \
747  extern "C" RUNTIME_EXPORT ALWAYS_INLINE void base_agg_func##_skip_val( \
748  ADDR_T* agg, const DATA_T val, const DATA_T skip_val) { \
749  if (val != skip_val) { \
750  const ADDR_T old_agg = *agg; \
751  if (old_agg != *reinterpret_cast<const ADDR_T*>(may_alias_ptr(&skip_val))) { \
752  base_agg_func(agg, val); \
753  } else { \
754  *agg = *reinterpret_cast<const ADDR_T*>(may_alias_ptr(&val)); \
755  } \
756  } \
757  }
758 
759 #define DATA_T double
760 #define ADDR_T int64_t
761 DEF_SKIP_AGG(agg_sum_double)
762 DEF_SKIP_AGG(agg_max_double)
763 DEF_SKIP_AGG(agg_min_double)
764 #undef ADDR_T
765 #undef DATA_T
766 
767 #define DATA_T float
768 #define ADDR_T int32_t
769 DEF_SKIP_AGG(agg_sum_float)
770 DEF_SKIP_AGG(agg_max_float)
771 DEF_SKIP_AGG(agg_min_float)
772 #undef ADDR_T
773 #undef DATA_T
774 
775 #undef DEF_SKIP_AGG_ADD
776 #undef DEF_SKIP_AGG
777 
778 extern "C" RUNTIME_EXPORT ALWAYS_INLINE int64_t decimal_floor(const int64_t x,
779  const int64_t scale) {
780  if (x >= 0) {
781  return x / scale * scale;
782  }
783  if (!(x % scale)) {
784  return x;
785  }
786  return x / scale * scale - scale;
787 }
788 
789 extern "C" RUNTIME_EXPORT ALWAYS_INLINE int64_t decimal_ceil(const int64_t x,
790  const int64_t scale) {
791  return decimal_floor(x, scale) + (x % scale ? scale : 0);
792 }
793 
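// Added illustrative examples, with scale = 100:
//   decimal_floor(1234, 100)  ->  1200     decimal_ceil(1234, 100)  ->  1300
//   decimal_floor(-1234, 100) -> -1300     decimal_ceil(-1234, 100) -> -1200
//   decimal_floor(1200, 100)  ->  1200     decimal_ceil(1200, 100)  ->  1200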
794 // Shared memory aggregators. These stubs should never be called;
795 // the real implementations are in cuda_mapd_rt.cu.
796 #define DEF_SHARED_AGG_RET_STUBS(base_agg_func) \
797  extern "C" GPU_RT_STUB uint64_t base_agg_func##_shared(uint64_t* agg, \
798  const int64_t val) { \
799  return 0; \
800  } \
801  \
802  extern "C" GPU_RT_STUB uint64_t base_agg_func##_skip_val_shared( \
803  uint64_t* agg, const int64_t val, const int64_t skip_val) { \
804  return 0; \
805  } \
806  extern "C" GPU_RT_STUB uint32_t base_agg_func##_int32_shared(uint32_t* agg, \
807  const int32_t val) { \
808  return 0; \
809  } \
810  \
811  extern "C" GPU_RT_STUB uint32_t base_agg_func##_int32_skip_val_shared( \
812  uint32_t* agg, const int32_t val, const int32_t skip_val) { \
813  return 0; \
814  } \
815  \
816  extern "C" GPU_RT_STUB uint64_t base_agg_func##_double_shared(uint64_t* agg, \
817  const double val) { \
818  return 0; \
819  } \
820  \
821  extern "C" GPU_RT_STUB uint64_t base_agg_func##_double_skip_val_shared( \
822  uint64_t* agg, const double val, const double skip_val) { \
823  return 0; \
824  } \
825  extern "C" GPU_RT_STUB uint32_t base_agg_func##_float_shared(uint32_t* agg, \
826  const float val) { \
827  return 0; \
828  } \
829  \
830  extern "C" GPU_RT_STUB uint32_t base_agg_func##_float_skip_val_shared( \
831  uint32_t* agg, const float val, const float skip_val) { \
832  return 0; \
833  }
834 
835 #define DEF_SHARED_AGG_STUBS(base_agg_func) \
836  extern "C" GPU_RT_STUB void base_agg_func##_shared(int64_t* agg, const int64_t val) {} \
837  \
838  extern "C" GPU_RT_STUB void base_agg_func##_skip_val_shared( \
839  int64_t* agg, const int64_t val, const int64_t skip_val) {} \
840  extern "C" GPU_RT_STUB void base_agg_func##_int32_shared(int32_t* agg, \
841  const int32_t val) {} \
842  extern "C" GPU_RT_STUB void base_agg_func##_int16_shared(int16_t* agg, \
843  const int16_t val) {} \
844  extern "C" GPU_RT_STUB void base_agg_func##_int8_shared(int8_t* agg, \
845  const int8_t val) {} \
846  \
847  extern "C" GPU_RT_STUB void base_agg_func##_int32_skip_val_shared( \
848  int32_t* agg, const int32_t val, const int32_t skip_val) {} \
849  \
850  extern "C" GPU_RT_STUB void base_agg_func##_double_shared(int64_t* agg, \
851  const double val) {} \
852  \
853  extern "C" GPU_RT_STUB void base_agg_func##_double_skip_val_shared( \
854  int64_t* agg, const double val, const double skip_val) {} \
855  extern "C" GPU_RT_STUB void base_agg_func##_float_shared(int32_t* agg, \
856  const float val) {} \
857  \
858  extern "C" GPU_RT_STUB void base_agg_func##_float_skip_val_shared( \
859  int32_t* agg, const float val, const float skip_val) {}
860 
861 DEF_SHARED_AGG_RET_STUBS(agg_count)
862 DEF_SHARED_AGG_STUBS(agg_max)
863 DEF_SHARED_AGG_STUBS(agg_min)
864 DEF_SHARED_AGG_STUBS(agg_id)
865 
866 extern "C" GPU_RT_STUB int8_t* agg_id_varlen_shared(int8_t* varlen_buffer,
867  const int64_t offset,
868  const int8_t* value,
869  const int64_t size_bytes) {
870  return nullptr;
871 }
872 
873 extern "C" GPU_RT_STUB int32_t checked_single_agg_id_shared(int64_t* agg,
874  const int64_t val,
875  const int64_t null_val) {
876  return 0;
877 }
878 
879 extern "C" GPU_RT_STUB int32_t
880 checked_single_agg_id_int32_shared(int32_t* agg,
 881  const int32_t val,
882  const int32_t null_val) {
883  return 0;
884 }
885 extern "C" GPU_RT_STUB int32_t
886 checked_single_agg_id_int16_shared(int16_t* agg,
 887  const int16_t val,
888  const int16_t null_val) {
889  return 0;
890 }
891 extern "C" GPU_RT_STUB int32_t checked_single_agg_id_int8_shared(int8_t* agg,
892  const int8_t val,
893  const int8_t null_val) {
894  return 0;
895 }
896 
897 extern "C" GPU_RT_STUB int32_t
898 checked_single_agg_id_double_shared(int64_t* agg,
 899  const double val,
900  const double null_val) {
901  return 0;
902 }
903 
904 extern "C" GPU_RT_STUB int32_t checked_single_agg_id_float_shared(int32_t* agg,
905  const float val,
906  const float null_val) {
907  return 0;
908 }
909 
910 extern "C" GPU_RT_STUB void agg_max_int16_skip_val_shared(int16_t* agg,
911  const int16_t val,
912  const int16_t skip_val) {}
913 
914 extern "C" GPU_RT_STUB void agg_max_int8_skip_val_shared(int8_t* agg,
915  const int8_t val,
916  const int8_t skip_val) {}
917 
918 extern "C" GPU_RT_STUB void agg_min_int16_skip_val_shared(int16_t* agg,
919  const int16_t val,
920  const int16_t skip_val) {}
921 
922 extern "C" GPU_RT_STUB void agg_min_int8_skip_val_shared(int8_t* agg,
923  const int8_t val,
924  const int8_t skip_val) {}
925 
926 extern "C" GPU_RT_STUB void agg_id_double_shared_slow(int64_t* agg, const double* val) {}
927 
928 extern "C" GPU_RT_STUB int64_t agg_sum_shared(int64_t* agg, const int64_t val) {
929  return 0;
930 }
931 
932 extern "C" GPU_RT_STUB int64_t agg_sum_skip_val_shared(int64_t* agg,
933  const int64_t val,
934  const int64_t skip_val) {
935  return 0;
936 }
937 extern "C" GPU_RT_STUB int32_t agg_sum_int32_shared(int32_t* agg, const int32_t val) {
938  return 0;
939 }
940 
941 extern "C" GPU_RT_STUB int32_t agg_sum_int32_skip_val_shared(int32_t* agg,
942  const int32_t val,
943  const int32_t skip_val) {
944  return 0;
945 }
946 
947 extern "C" GPU_RT_STUB void agg_sum_double_shared(int64_t* agg, const double val) {}
948 
949 extern "C" GPU_RT_STUB void agg_sum_double_skip_val_shared(int64_t* agg,
950  const double val,
951  const double skip_val) {}
952 extern "C" GPU_RT_STUB void agg_sum_float_shared(int32_t* agg, const float val) {}
953 
954 extern "C" GPU_RT_STUB void agg_sum_float_skip_val_shared(int32_t* agg,
955  const float val,
956  const float skip_val) {}
957 
958 extern "C" GPU_RT_STUB void force_sync() {}
959 
960 extern "C" GPU_RT_STUB void sync_warp() {}
961 extern "C" GPU_RT_STUB void sync_warp_protected(int64_t thread_pos, int64_t row_count) {}
962 extern "C" GPU_RT_STUB void sync_threadblock() {}
963 
964 extern "C" GPU_RT_STUB void write_back_non_grouped_agg(int64_t* input_buffer,
965  int64_t* output_buffer,
966  const int32_t num_agg_cols){};
967 // x64 stride functions
968 
969 extern "C" RUNTIME_EXPORT NEVER_INLINE int32_t pos_start_impl(int32_t* error_code) {
970  int32_t row_index_resume{0};
971  if (error_code) {
972  row_index_resume = error_code[0];
973  error_code[0] = 0;
974  }
975  return row_index_resume;
976 }
977 
978 extern "C" RUNTIME_EXPORT int32_t group_buff_idx_impl() {
 979  return pos_start_impl(nullptr);
980 }
981 
982 extern "C" RUNTIME_EXPORT int32_t pos_step_impl() {
 983  return 1;
984 }
985 
986 extern "C" GPU_RT_STUB int8_t thread_warp_idx(const int8_t warp_sz) {
987  return 0;
988 }
989 
990 extern "C" GPU_RT_STUB int64_t get_thread_index() {
991  return 0;
992 }
993 
994 extern "C" GPU_RT_STUB int64_t* declare_dynamic_shared_memory() {
 995  return nullptr;
996 }
997 
998 extern "C" GPU_RT_STUB int64_t get_block_index() {
999  return 0;
1000 }
1001 
1002 #undef GPU_RT_STUB
1003 
1004 extern "C" RUNTIME_EXPORT ALWAYS_INLINE void record_error_code(const int32_t err_code,
1005  int32_t* error_codes) {
1006  // NB: never override persistent error codes (with code greater than zero).
1007  // On GPU, a projection query with a limit can run out of slots without it
1008  // being an actual error if the limit has been hit. If a persistent error
1009  // (division by zero, for example) occurs before running out of slots, we
1010  // have to avoid overriding it, because there's a risk that the query would
1011  // go through if we override with a potentially benign out-of-slots code.
1012  if (err_code && error_codes[pos_start_impl(nullptr)] <= 0) {
1013  error_codes[pos_start_impl(nullptr)] = err_code;
1014  }
1015 }
1016 
1017 extern "C" RUNTIME_EXPORT ALWAYS_INLINE int32_t get_error_code(int32_t* error_codes) {
1018  return error_codes[pos_start_impl(nullptr)];
1019 }
1020 
1021 // group by helpers
1022 
1023 extern "C" RUNTIME_EXPORT const int64_t* init_shared_mem_nop(
 1024  const int64_t* groups_buffer,
1025  const int32_t groups_buffer_size) {
1026  return groups_buffer;
1027 }
1028 
1029 extern "C" RUNTIME_EXPORT void write_back_nop(int64_t* dest,
 1030  int64_t* src,
1031  const int32_t sz) {
1032 #ifndef _WIN32
1033  // the body is not really needed, just make sure the call is not optimized away
1034  assert(dest);
1035 #endif
1036 }
1037 
1038 extern "C" RUNTIME_EXPORT int64_t* init_shared_mem(const int64_t* global_groups_buffer,
1039  const int32_t groups_buffer_size) {
1040  return nullptr;
1041 }
1042 
1043 extern "C" RUNTIME_EXPORT NEVER_INLINE void init_group_by_buffer_gpu(
 1044  int64_t* groups_buffer,
1045  const int64_t* init_vals,
1046  const uint32_t groups_buffer_entry_count,
1047  const uint32_t key_qw_count,
1048  const uint32_t agg_col_count,
1049  const bool keyless,
1050  const int8_t warp_size) {
1051 #ifndef _WIN32
1052  // the body is not really needed, just make sure the call is not optimized away
1053  assert(groups_buffer);
1054 #endif
1055 }
1056 
1057 extern "C" RUNTIME_EXPORT NEVER_INLINE void init_columnar_group_by_buffer_gpu(
 1058  int64_t* groups_buffer,
1059  const int64_t* init_vals,
1060  const uint32_t groups_buffer_entry_count,
1061  const uint32_t key_qw_count,
1062  const uint32_t agg_col_count,
1063  const bool keyless,
1064  const bool blocks_share_memory,
1065  const int32_t frag_idx) {
1066 #ifndef _WIN32
1067  // the body is not really needed, just make sure the call is not optimized away
1068  assert(groups_buffer);
1069 #endif
1070 }
1071 
1072 extern "C" RUNTIME_EXPORT NEVER_INLINE void init_group_by_buffer_impl(
 1073  int64_t* groups_buffer,
1074  const int64_t* init_vals,
1075  const uint32_t groups_buffer_entry_count,
1076  const uint32_t key_qw_count,
1077  const uint32_t agg_col_count,
1078  const bool keyless,
1079  const int8_t warp_size) {
1080 #ifndef _WIN32
1081  // the body is not really needed, just make sure the call is not optimized away
1082  assert(groups_buffer);
1083 #endif
1084 }
1085 
1086 template <typename T>
1087 ALWAYS_INLINE int64_t* get_matching_group_value(int64_t* groups_buffer,
1088  const uint32_t h,
1089  const T* key,
1090  const uint32_t key_count,
1091  const uint32_t row_size_quad) {
1092  auto off = h * row_size_quad;
1093  auto row_ptr = reinterpret_cast<T*>(groups_buffer + off);
1094  if (*row_ptr == get_empty_key<T>()) {
1095  memcpy(row_ptr, key, key_count * sizeof(T));
1096  auto row_ptr_i8 = reinterpret_cast<int8_t*>(row_ptr + key_count);
1097  return reinterpret_cast<int64_t*>(align_to_int64(row_ptr_i8));
1098  }
1099  if (memcmp(row_ptr, key, key_count * sizeof(T)) == 0) {
1100  auto row_ptr_i8 = reinterpret_cast<int8_t*>(row_ptr + key_count);
1101  return reinterpret_cast<int64_t*>(align_to_int64(row_ptr_i8));
1102  }
1103  return nullptr;
1104 }
1105 
1106 extern "C" RUNTIME_EXPORT ALWAYS_INLINE int64_t* get_matching_group_value(
 1107  int64_t* groups_buffer,
1108  const uint32_t h,
1109  const int64_t* key,
1110  const uint32_t key_count,
1111  const uint32_t key_width,
1112  const uint32_t row_size_quad) {
1113  switch (key_width) {
1114  case 4:
1115  return get_matching_group_value(groups_buffer,
1116  h,
1117  reinterpret_cast<const int32_t*>(key),
1118  key_count,
1119  row_size_quad);
1120  case 8:
1121  return get_matching_group_value(groups_buffer, h, key, key_count, row_size_quad);
1122  default:;
1123  }
1124  return nullptr;
1125 }
1126 
1127 template <typename T>
1128 ALWAYS_INLINE int32_t get_matching_group_value_columnar_slot(int64_t* groups_buffer,
 1129  const uint32_t entry_count,
1130  const uint32_t h,
1131  const T* key,
1132  const uint32_t key_count) {
1133  auto off = h;
1134  auto key_buffer = reinterpret_cast<T*>(groups_buffer);
1135  if (key_buffer[off] == get_empty_key<T>()) {
1136  for (size_t i = 0; i < key_count; ++i) {
1137  key_buffer[off] = key[i];
1138  off += entry_count;
1139  }
1140  return h;
1141  }
1142  off = h;
1143  for (size_t i = 0; i < key_count; ++i) {
1144  if (key_buffer[off] != key[i]) {
1145  return -1;
1146  }
1147  off += entry_count;
1148  }
1149  return h;
1150 }
1151 
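// Added illustrative note: in the columnar layout probed above, key component
// i of entry h lives at key_buffer[i * entry_count + h]; keys are strided by
// entry_count rather than stored contiguously per row.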
1152 extern "C" RUNTIME_EXPORT ALWAYS_INLINE int32_t
1153 get_matching_group_value_columnar_slot(int64_t* groups_buffer,
 1154  const uint32_t entry_count,
1155  const uint32_t h,
1156  const int64_t* key,
1157  const uint32_t key_count,
1158  const uint32_t key_width) {
1159  switch (key_width) {
1160  case 4:
1161  return get_matching_group_value_columnar_slot(groups_buffer,
1162  entry_count,
1163  h,
1164  reinterpret_cast<const int32_t*>(key),
1165  key_count);
1166  case 8:
1167  return get_matching_group_value_columnar_slot(
 1168  groups_buffer, entry_count, h, key, key_count);
1169  default:
1170  return -1;
1171  }
1172  return -1;
1173 }
1174 
1175 extern "C" RUNTIME_EXPORT ALWAYS_INLINE int64_t* get_matching_group_value_columnar(
 1176  int64_t* groups_buffer,
1177  const uint32_t h,
1178  const int64_t* key,
1179  const uint32_t key_qw_count,
1180  const size_t entry_count) {
1181  auto off = h;
1182  if (groups_buffer[off] == EMPTY_KEY_64) {
1183  for (size_t i = 0; i < key_qw_count; ++i) {
1184  groups_buffer[off] = key[i];
1185  off += entry_count;
1186  }
1187  return &groups_buffer[off];
1188  }
1189  off = h;
1190  for (size_t i = 0; i < key_qw_count; ++i) {
1191  if (groups_buffer[off] != key[i]) {
1192  return nullptr;
1193  }
1194  off += entry_count;
1195  }
1196  return &groups_buffer[off];
1197 }
1198 
1199 /*
1200  * For a particular hashed_index, returns the row-wise offset
1201  * to the first matching agg column in memory.
1202  * It also checks the corresponding group columns and initializes all the
1203  * keys for this entry if the slot is still empty (it is assumed all group
1204  * columns are 64-bit wide).
1205  *
1206  * Memory layout:
1207  *
1208  * | prepended group columns (64-bit each) | agg columns |
1209  */
1210 extern "C" RUNTIME_EXPORT ALWAYS_INLINE int64_t* get_matching_group_value_perfect_hash(
 1211  int64_t* groups_buffer,
1212  const uint32_t hashed_index,
1213  const int64_t* key,
1214  const uint32_t key_count,
1215  const uint32_t row_size_quad) {
1216  uint32_t off = hashed_index * row_size_quad;
1217  if (groups_buffer[off] == EMPTY_KEY_64) {
1218  for (uint32_t i = 0; i < key_count; ++i) {
1219  groups_buffer[off + i] = key[i];
1220  }
1221  }
1222  return groups_buffer + off + key_count;
1223 }
1224 
1231 extern "C" RUNTIME_EXPORT ALWAYS_INLINE int64_t*
1232 get_matching_group_value_perfect_hash_keyless(int64_t* groups_buffer,
 1233  const uint32_t hashed_index,
1234  const uint32_t row_size_quad) {
1235  return groups_buffer + row_size_quad * hashed_index;
1236 }
1237 
1238 /*
1239  * For a particular hashed_index, find and initialize (if necessary) all the group
1240  * columns corresponding to a key. It is assumed that all group columns are 64-bit wide.
1241  */
1242 extern "C" RUNTIME_EXPORT ALWAYS_INLINE void
1243 set_matching_group_value_perfect_hash_columnar(int64_t* groups_buffer,
 1244  const uint32_t hashed_index,
1245  const int64_t* key,
1246  const uint32_t key_count,
1247  const uint32_t entry_count) {
1248  if (groups_buffer[hashed_index] == EMPTY_KEY_64) {
1249  for (uint32_t i = 0; i < key_count; i++) {
1250  groups_buffer[i * entry_count + hashed_index] = key[i];
1251  }
1252  }
1253 }
1254 
1255 #include "GeoOpsRuntime.cpp"
1256 #include "GroupByRuntime.cpp"
1257 #include "JoinHashTable/Runtime/JoinHashTableQueryRuntime.cpp"
1258 
1259 extern "C" RUNTIME_EXPORT ALWAYS_INLINE int64_t* get_group_value_fast_keyless(
 1260  int64_t* groups_buffer,
1261  const int64_t key,
1262  const int64_t min_key,
1263  const int64_t /* bucket */,
1264  const uint32_t row_size_quad) {
1265  return groups_buffer + row_size_quad * (key - min_key);
1266 }
1267 
1268 extern "C" RUNTIME_EXPORT ALWAYS_INLINE int64_t* get_group_value_fast_keyless_semiprivate(
 1269  int64_t* groups_buffer,
1270  const int64_t key,
1271  const int64_t min_key,
1272  const int64_t /* bucket */,
1273  const uint32_t row_size_quad,
1274  const uint8_t thread_warp_idx,
1275  const uint8_t warp_size) {
1276  return groups_buffer + row_size_quad * (warp_size * (key - min_key) + thread_warp_idx);
1277 }
1278 
1279 extern "C" RUNTIME_EXPORT ALWAYS_INLINE int8_t* extract_str_ptr(
 1280  const uint64_t str_and_len) {
1281  return reinterpret_cast<int8_t*>(str_and_len & 0xffffffffffff);
1282 }
1283 
1284 extern "C" RUNTIME_EXPORT ALWAYS_INLINE int32_t
1285 extract_str_len(const uint64_t str_and_len) {
1286  return static_cast<int64_t>(str_and_len) >> 48;
1287 }
1288 
1289 extern "C" RUNTIME_EXPORT NEVER_INLINE int8_t* extract_str_ptr_noinline(
 1290  const uint64_t str_and_len) {
1291  return extract_str_ptr(str_and_len);
1292 }
1293 
1294 extern "C" RUNTIME_EXPORT NEVER_INLINE int32_t
1295 extract_str_len_noinline(const uint64_t str_and_len) {
1296  return extract_str_len(str_and_len);
1297 }
1298 
1299 extern "C" RUNTIME_EXPORT ALWAYS_INLINE uint64_t string_pack(const int8_t* ptr,
1300  const int32_t len) {
1301  return (reinterpret_cast<const uint64_t>(ptr) & 0xffffffffffff) |
1302  (static_cast<const uint64_t>(len) << 48);
1303 }
1304 
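// Added illustrative note: string_pack stores the pointer in the low 48 bits
// and the length in the high 16 bits of one uint64_t; extract_str_ptr and
// extract_str_len above undo that packing.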
1305 #ifdef __clang__
1306 #include "../Utils/StringLike.cpp"
1307 #endif
1308 
1309 #ifndef __CUDACC__
1310 #include "TopKRuntime.cpp"
1311 #endif
1312 
1313 extern "C" RUNTIME_EXPORT ALWAYS_INLINE DEVICE int32_t
1314 char_length(const char* str, const int32_t str_len) {
1315  return str_len;
1316 }
1317 
1318 extern "C" RUNTIME_EXPORT ALWAYS_INLINE DEVICE int32_t
1319 char_length_nullable(const char* str, const int32_t str_len, const int32_t int_null) {
1320  if (!str) {
1321  return int_null;
1322  }
1323  return str_len;
1324 }
1325 
1326 extern "C" RUNTIME_EXPORT ALWAYS_INLINE DEVICE int32_t
1327 key_for_string_encoded(const int32_t str_id) {
1328  return str_id;
1329 }
1330 
1331 extern "C" RUNTIME_EXPORT ALWAYS_INLINE DEVICE bool sample_ratio(
 1332  const double proportion,
1333  const int64_t row_offset) {
1334  const int64_t threshold = 4294967296 * proportion;
1335  return (row_offset * 2654435761) % 4294967296 < threshold;
1336 }
1337 
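// Added illustrative note: sample_ratio hashes the row offset with the Knuth
// multiplicative constant 2654435761 modulo 2^32 and keeps the row when the
// result falls below proportion * 2^32, yielding a deterministic
// pseudo-random sample of roughly the requested fraction.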
1338 extern "C" RUNTIME_EXPORT ALWAYS_INLINE DEVICE double width_bucket(
 1339  const double target_value,
1340  const double lower_bound,
1341  const double upper_bound,
1342  const double scale_factor,
1343  const int32_t partition_count) {
1344  if (target_value < lower_bound) {
1345  return 0;
1346  } else if (target_value >= upper_bound) {
1347  return partition_count + 1;
1348  }
1349  return ((target_value - lower_bound) * scale_factor) + 1;
1350 }
1351 
1352 extern "C" RUNTIME_EXPORT ALWAYS_INLINE DEVICE double width_bucket_reversed(
 1353  const double target_value,
1354  const double lower_bound,
1355  const double upper_bound,
1356  const double scale_factor,
1357  const int32_t partition_count) {
1358  if (target_value > lower_bound) {
1359  return 0;
1360  } else if (target_value <= upper_bound) {
1361  return partition_count + 1;
1362  }
1363  return ((lower_bound - target_value) * scale_factor) + 1;
1364 }
1365 
1366 extern "C" RUNTIME_EXPORT ALWAYS_INLINE double width_bucket_nullable(
 1367  const double target_value,
1368  const double lower_bound,
1369  const double upper_bound,
1370  const double scale_factor,
1371  const int32_t partition_count,
1372  const double null_val) {
1373  if (target_value == null_val) {
1374  return INT32_MIN;
1375  }
1376  return width_bucket(
1377  target_value, lower_bound, upper_bound, scale_factor, partition_count);
1378 }
1379 
1380 extern "C" RUNTIME_EXPORT ALWAYS_INLINE double width_bucket_reversed_nullable(
 1381  const double target_value,
1382  const double lower_bound,
1383  const double upper_bound,
1384  const double scale_factor,
1385  const int32_t partition_count,
1386  const double null_val) {
1387  if (target_value == null_val) {
1388  return INT32_MIN;
1389  }
1390  return width_bucket_reversed(
1391  target_value, lower_bound, upper_bound, scale_factor, partition_count);
1392 }
1393 
1394 // Version of width_bucket without the out-of-bounds check; it can be called
1395 // when the input target_value expression is guaranteed to lie within the valid
1396 // range (so null checking can be skipped as well)
1397 extern "C" RUNTIME_EXPORT ALWAYS_INLINE DEVICE double width_bucket_no_oob_check(
 1398  const double target_value,
1399  const double lower_bound,
1400  const double scale_factor) {
1401  return ((target_value - lower_bound) * scale_factor) + 1;
1402 }
1403 
1404 extern "C" RUNTIME_EXPORT ALWAYS_INLINE DEVICE double width_bucket_reversed_no_oob_check(
 1405  const double target_value,
1406  const double lower_bound,
1407  const double scale_factor) {
1408  return ((lower_bound - target_value) * scale_factor) + 1;
1409 }
1410 
1411 extern "C" RUNTIME_EXPORT ALWAYS_INLINE DEVICE double width_bucket_expr(
 1412  const double target_value,
1413  const bool reversed,
1414  const double lower_bound,
1415  const double upper_bound,
1416  const int32_t partition_count) {
1417  if (reversed) {
1418  return width_bucket_reversed(target_value,
1419  lower_bound,
1420  upper_bound,
1421  partition_count / (lower_bound - upper_bound),
1422  partition_count);
1423  }
1424  return width_bucket(target_value,
1425  lower_bound,
1426  upper_bound,
1427  partition_count / (upper_bound - lower_bound),
1428  partition_count);
1429 }
1430 
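// Added illustrative example: width_bucket_expr(5.5, false, 0.0, 10.0, 10)
// uses scale_factor = 10 / (10 - 0) = 1 and returns ((5.5 - 0) * 1) + 1 = 6.5;
// out-of-range inputs map to bucket 0 or partition_count + 1. Presumably the
// fractional result is truncated to the integer bucket number downstream.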
1431 extern "C" RUNTIME_EXPORT ALWAYS_INLINE DEVICE double width_bucket_expr_nullable(
 1432  const double target_value,
1433  const bool reversed,
1434  const double lower_bound,
1435  const double upper_bound,
1436  const int32_t partition_count,
1437  const double null_val) {
1438  if (target_value == null_val) {
1439  return INT32_MIN;
1440  }
1441  return width_bucket_expr(
1442  target_value, reversed, lower_bound, upper_bound, partition_count);
1443 }
1444 
1445 extern "C" RUNTIME_EXPORT ALWAYS_INLINE DEVICE double width_bucket_expr_no_oob_check(
 1446  const double target_value,
1447  const bool reversed,
1448  const double lower_bound,
1449  const double upper_bound,
1450  const int32_t partition_count) {
1451  if (reversed) {
1452  return width_bucket_reversed_no_oob_check(
 1453  target_value, lower_bound, partition_count / (lower_bound - upper_bound));
1454  }
1455  return width_bucket_no_oob_check(
 1456  target_value, lower_bound, partition_count / (upper_bound - lower_bound));
1457 }
1458 
1459 extern "C" RUNTIME_EXPORT ALWAYS_INLINE int64_t
1460 row_number_window_func(const int64_t output_buff, const int64_t pos) {
1461  return reinterpret_cast<const int64_t*>(output_buff)[pos];
1462 }
1463 
1464 extern "C" RUNTIME_EXPORT ALWAYS_INLINE double percent_window_func(
 1465  const int64_t output_buff,
1466  const int64_t pos) {
1467  return reinterpret_cast<const double*>(output_buff)[pos];
1468 }
1469 
1470 extern "C" RUNTIME_EXPORT ALWAYS_INLINE double load_double(const int64_t* agg) {
1471  return *reinterpret_cast<const double*>(may_alias_ptr(agg));
1472 }
1473 
1474 extern "C" RUNTIME_EXPORT ALWAYS_INLINE float load_float(const int32_t* agg) {
1475  return *reinterpret_cast<const float*>(may_alias_ptr(agg));
1476 }
1477 
1478 extern "C" RUNTIME_EXPORT ALWAYS_INLINE double load_avg_int(const int64_t* sum,
1479  const int64_t* count,
1480  const double null_val) {
1481  return *count != 0 ? static_cast<double>(*sum) / *count : null_val;
1482 }
1483 
1484 extern "C" RUNTIME_EXPORT ALWAYS_INLINE double load_avg_decimal(const int64_t* sum,
1485  const int64_t* count,
1486  const double null_val,
1487  const uint32_t scale) {
1488  return *count != 0 ? (static_cast<double>(*sum) / pow(10, scale)) / *count : null_val;
1489 }
1490 
1491 extern "C" RUNTIME_EXPORT ALWAYS_INLINE double load_avg_double(const int64_t* agg,
1492  const int64_t* count,
1493  const double null_val) {
1494  return *count != 0 ? *reinterpret_cast<const double*>(may_alias_ptr(agg)) / *count
1495  : null_val;
1496 }
1497 
1498 extern "C" RUNTIME_EXPORT ALWAYS_INLINE double load_avg_float(const int32_t* agg,
1499  const int32_t* count,
1500  const double null_val) {
1501  return *count != 0 ? *reinterpret_cast<const float*>(may_alias_ptr(agg)) / *count
1502  : null_val;
1503 }
1504 
1505 extern "C" RUNTIME_EXPORT ALWAYS_INLINE void linear_probabilistic_count(
 1506  uint8_t* bitmap,
1507  const uint32_t bitmap_bytes,
1508  const uint8_t* key_bytes,
1509  const uint32_t key_len) {
1510  const uint32_t bit_pos = MurmurHash3(key_bytes, key_len, 0) % (bitmap_bytes * 8);
1511  const uint32_t word_idx = bit_pos / 32;
1512  const uint32_t bit_idx = bit_pos % 32;
1513  reinterpret_cast<uint32_t*>(bitmap)[word_idx] |= 1 << bit_idx;
1514 }
1515 
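// Added illustrative note: linear_probabilistic_count implements linear
// (bitmap) counting: each key sets exactly one bit chosen by MurmurHash3
// modulo the bitmap size in bits, and the number of distinct keys can later
// be estimated from the count of set bits.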
1516 extern "C" RUNTIME_EXPORT NEVER_INLINE void query_stub_hoisted_literals(
 1517  const int8_t** col_buffers,
1518  const int8_t* literals,
1519  const int64_t* num_rows,
1520  const uint64_t* frag_row_offsets,
1521  const int32_t* max_matched,
1522  const int64_t* init_agg_value,
1523  int64_t** out,
1524  uint32_t frag_idx,
1525  const int64_t* join_hash_tables,
1526  int32_t* error_code,
1527  int32_t* total_matched) {
1528 #ifndef _WIN32
1529  assert(col_buffers || literals || num_rows || frag_row_offsets || max_matched ||
1530  init_agg_value || out || frag_idx || error_code || join_hash_tables ||
1531  total_matched);
1532 #endif
1533 }
1534 
1535 extern "C" RUNTIME_EXPORT void multifrag_query_hoisted_literals(
 1536  const int8_t*** col_buffers,
1537  const uint64_t* num_fragments,
1538  const int8_t* literals,
1539  const int64_t* num_rows,
1540  const uint64_t* frag_row_offsets,
1541  const int32_t* max_matched,
1542  int32_t* total_matched,
1543  const int64_t* init_agg_value,
1544  int64_t** out,
1545  int32_t* error_code,
1546  const uint32_t* num_tables_ptr,
1547  const int64_t* join_hash_tables) {
1548  for (uint32_t i = 0; i < *num_fragments; ++i) {
1549  query_stub_hoisted_literals(col_buffers ? col_buffers[i] : nullptr,
1550  literals,
1551  &num_rows[i * (*num_tables_ptr)],
1552  &frag_row_offsets[i * (*num_tables_ptr)],
1553  max_matched,
1554  init_agg_value,
1555  out,
1556  i,
1557  join_hash_tables,
1558  total_matched,
1559  error_code);
1560  }
1561 }
1562 
1563 extern "C" RUNTIME_EXPORT NEVER_INLINE void query_stub(const int8_t** col_buffers,
1564  const int64_t* num_rows,
1565  const uint64_t* frag_row_offsets,
1566  const int32_t* max_matched,
1567  const int64_t* init_agg_value,
1568  int64_t** out,
1569  uint32_t frag_idx,
1570  const int64_t* join_hash_tables,
1571  int32_t* error_code,
1572  int32_t* total_matched) {
1573 #ifndef _WIN32
1574  assert(col_buffers || num_rows || frag_row_offsets || max_matched || init_agg_value ||
1575  out || frag_idx || error_code || join_hash_tables || total_matched);
1576 #endif
1577 }
1578 
1579 extern "C" RUNTIME_EXPORT void multifrag_query(const int8_t*** col_buffers,
1580  const uint64_t* num_fragments,
1581  const int64_t* num_rows,
1582  const uint64_t* frag_row_offsets,
1583  const int32_t* max_matched,
1584  int32_t* total_matched,
1585  const int64_t* init_agg_value,
1586  int64_t** out,
1587  int32_t* error_code,
1588  const uint32_t* num_tables_ptr,
1589  const int64_t* join_hash_tables) {
1590  for (uint32_t i = 0; i < *num_fragments; ++i) {
1591  query_stub(col_buffers ? col_buffers[i] : nullptr,
1592  &num_rows[i * (*num_tables_ptr)],
1593  &frag_row_offsets[i * (*num_tables_ptr)],
1594  max_matched,
1595  init_agg_value,
1596  out,
1597  i,
1598  join_hash_tables,
1599  total_matched,
1600  error_code);
1601  }
1602 }
1603 
1604 extern "C" RUNTIME_EXPORT bool check_interrupt() {
 1605  if (check_interrupt_init(static_cast<unsigned>(INT_CHECK))) {
1606  return true;
1607  }
1608  return false;
1609 }
1610 
1611 extern "C" RUNTIME_EXPORT bool check_interrupt_init(unsigned command) {
1612  static std::atomic_bool runtime_interrupt_flag{false};
1613 
1614  if (command == static_cast<unsigned>(INT_CHECK)) {
1615  if (runtime_interrupt_flag.load()) {
1616  return true;
1617  }
1618  return false;
1619  }
1620  if (command == static_cast<unsigned>(INT_ABORT)) {
1621  runtime_interrupt_flag.store(true);
1622  return false;
1623  }
1624  if (command == static_cast<unsigned>(INT_RESET)) {
1625  runtime_interrupt_flag.store(false);
1626  return false;
1627  }
1628  return false;
1629 }
DEVICE auto upper_bound(ARGS &&...args)
Definition: gpu_enabled.h:123
__device__ void sync_warp_protected(int64_t thread_pos, int64_t row_count)
RUNTIME_EXPORT void agg_min_int8(int8_t *agg, const int8_t val)
RUNTIME_EXPORT ALWAYS_INLINE void agg_count_distinct_bitmap_skip_val(int64_t *agg, const int64_t val, const int64_t min_val, const int64_t skip_val)
#define DEF_UMINUS_NULLABLE(type, null_type)
GPU_RT_STUB int32_t checked_single_agg_id_int32_shared(int32_t *agg, const int32_t val, const int32_t null_val)
__device__ void agg_count_distinct_bitmap_skip_val_gpu(int64_t *agg, const int64_t val, const int64_t min_val, const int64_t skip_val, const int64_t base_dev_addr, const int64_t base_host_addr, const uint64_t sub_bitmap_count, const uint64_t bitmap_bytes)
#define DEF_CHECKED_SINGLE_AGG_ID_INT(n)
RUNTIME_EXPORT ALWAYS_INLINE int8_t * agg_id_varlen(int8_t *varlen_buffer, const int64_t offset, const int8_t *value, const int64_t size_bytes)
RUNTIME_EXPORT void agg_max_int32(int32_t *agg, const int32_t val)
RUNTIME_EXPORT ALWAYS_INLINE int64_t scale_decimal_down_not_nullable(const int64_t operand, const int64_t scale, const int64_t null_val)
__device__ int64_t * get_matching_group_value_columnar(int64_t *groups_buffer, const uint32_t h, const int64_t *key, const uint32_t key_qw_count, const size_t entry_count)
#define EMPTY_KEY_64
RUNTIME_EXPORT ALWAYS_INLINE int64_t * get_matching_group_value_perfect_hash(int64_t *groups_buffer, const uint32_t hashed_index, const int64_t *key, const uint32_t key_count, const uint32_t row_size_quad)
__device__ void write_back_nop(int64_t *dest, int64_t *src, const int32_t sz)
Definition: cuda_mapd_rt.cu:49
RUNTIME_EXPORT void agg_min_int16(int16_t *agg, const int16_t val)
#define GPU_RT_STUB
__device__ void agg_sum_float_skip_val_shared(int32_t *agg, const float val, const float skip_val)
RUNTIME_EXPORT ALWAYS_INLINE int64_t floor_div_nullable_lhs(const int64_t dividend, const int64_t divisor, const int64_t null_val)
RUNTIME_EXPORT void agg_max_int16(int16_t *agg, const int16_t val)
#define DEF_CAST_NULLABLE_BIDIR(type1, type2)
RUNTIME_EXPORT ALWAYS_INLINE int64_t * get_group_value_fast_keyless(int64_t *groups_buffer, const int64_t key, const int64_t min_key, const int64_t, const uint32_t row_size_quad)
RUNTIME_EXPORT ALWAYS_INLINE uint32_t agg_count_int32_skip_val(uint32_t *agg, const int32_t val, const int32_t skip_val)
RUNTIME_EXPORT NEVER_INLINE void agg_approximate_count_distinct(int64_t *agg, const int64_t key, const uint32_t b)
FORCE_INLINE uint8_t get_rank(uint64_t x, uint32_t b)
__device__ int8_t thread_warp_idx(const int8_t warp_sz)
Definition: cuda_mapd_rt.cu:39
__global__ void init_group_by_buffer_gpu(int64_t *groups_buffer, const int64_t *init_vals, const uint32_t groups_buffer_entry_count, const uint32_t key_count, const uint32_t key_width, const uint32_t row_size_quad, const bool keyless, const int8_t warp_size)
RUNTIME_EXPORT ALWAYS_INLINE int32_t extract_str_len(const uint64_t str_and_len)
#define DEF_CAST_NULLABLE(from_type, to_type)
RUNTIME_EXPORT ALWAYS_INLINE DEVICE double width_bucket_expr_no_oob_check(const double target_value, const bool reversed, const double lower_bound, const double upper_bound, const int32_t partition_count)
RUNTIME_EXPORT ALWAYS_INLINE void agg_max_double(int64_t *agg, const double val)
RUNTIME_EXPORT ALWAYS_INLINE void agg_max(int64_t *agg, const int64_t val)
RUNTIME_EXPORT ALWAYS_INLINE uint64_t agg_count_skip_val(uint64_t *agg, const int64_t val, const int64_t skip_val)
__device__ int64_t get_thread_index()
Definition: cuda_mapd_rt.cu:19
RUNTIME_EXPORT NEVER_INLINE DEVICE uint64_t MurmurHash64A(const void *key, int len, uint64_t seed)
Definition: MurmurHash.cpp:27
RUNTIME_EXPORT ALWAYS_INLINE int8_t * extract_str_ptr(const uint64_t str_and_len)
#define const
RUNTIME_EXPORT ALWAYS_INLINE void agg_min_float(int32_t *agg, const float val)
__device__ int32_t pos_step_impl()
Definition: cuda_mapd_rt.cu:35
RUNTIME_EXPORT ALWAYS_INLINE DEVICE double width_bucket(const double target_value, const double lower_bound, const double upper_bound, const double scale_factor, const int32_t partition_count)
__device__ void write_back_non_grouped_agg(int64_t *input_buffer, int64_t *output_buffer, const int32_t agg_idx)
RUNTIME_EXPORT ALWAYS_INLINE double load_avg_int(const int64_t *sum, const int64_t *count, const double null_val)
__device__ int32_t checked_single_agg_id_double_shared(int64_t *agg, const double val, const double null_val)
RUNTIME_EXPORT ALWAYS_INLINE DEVICE double width_bucket_no_oob_check(const double target_value, const double lower_bound, const double scale_factor)
__device__ const int64_t * init_shared_mem_nop(const int64_t *groups_buffer, const int32_t groups_buffer_size)
Definition: cuda_mapd_rt.cu:43
RUNTIME_EXPORT ALWAYS_INLINE int32_t checked_single_agg_id(int64_t *agg, const int64_t val, const int64_t null_val)
#define DEF_ARITH_NULLABLE_RHS(type, null_type, opname, opsym)
RUNTIME_EXPORT ALWAYS_INLINE DEVICE double width_bucket_reversed_no_oob_check(const double target_value, const double lower_bound, const double scale_factor)
#define DEF_AGG_MAX_INT(n)
__device__ int32_t checked_single_agg_id_float_shared(int32_t *agg, const float val, const float null_val)
RUNTIME_EXPORT ALWAYS_INLINE void agg_count_distinct_bitmap(int64_t *agg, const int64_t val, const int64_t min_val)
__device__ int64_t * get_matching_group_value(int64_t *groups_buffer, const uint32_t h, const T *key, const uint32_t key_count, const uint32_t row_size_quad)
RUNTIME_EXPORT ALWAYS_INLINE int8_t logical_and(const int8_t lhs, const int8_t rhs, const int8_t null_val)
#define DEF_CAST_SCALED_NULLABLE(from_type, to_type)
RUNTIME_EXPORT NEVER_INLINE void query_stub(const int8_t **col_buffers, const int64_t *num_rows, const uint64_t *frag_row_offsets, const int32_t *max_matched, const int64_t *init_agg_value, int64_t **out, uint32_t frag_idx, const int64_t *join_hash_tables, int32_t *error_code, int32_t *total_matched)
__device__ int64_t agg_sum_shared(int64_t *agg, const int64_t val)
__device__ void agg_id_double_shared_slow(int64_t *agg, const double *val)
RUNTIME_EXPORT ALWAYS_INLINE int64_t floor_div_lhs(const int64_t dividend, const int64_t divisor)
RUNTIME_EXPORT ALWAYS_INLINE DEVICE int32_t key_for_string_encoded(const int32_t str_id)
RUNTIME_EXPORT ALWAYS_INLINE int64_t * get_matching_group_value_perfect_hash_keyless(int64_t *groups_buffer, const uint32_t hashed_index, const uint32_t row_size_quad)
__device__ int8_t * agg_id_varlen_shared(int8_t *varlen_buffer, const int64_t offset, const int8_t *value, const int64_t size_bytes)
__device__ int64_t * declare_dynamic_shared_memory()
Definition: cuda_mapd_rt.cu:56
__device__ int32_t agg_sum_int32_shared(int32_t *agg, const int32_t val)
__device__ int64_t agg_sum_skip_val_shared(int64_t *agg, const int64_t val, const int64_t skip_val)
__device__ void agg_sum_float_shared(int32_t *agg, const float val)
#define DEF_ROUND_NULLABLE(from_type, to_type)
#define DEVICE
RUNTIME_EXPORT ALWAYS_INLINE DEVICE double width_bucket_reversed(const double target_value, const double lower_bound, const double upper_bound, const double scale_factor, const int32_t partition_count)
#define DEF_SKIP_AGG(base_agg_func)
__device__ int64_t get_block_index()
Definition: cuda_mapd_rt.cu:23
__device__ bool check_interrupt()
#define DEF_WRITE_PROJECTION_INT(n)
GPU_RT_STUB int32_t checked_single_agg_id_int8_shared(int8_t *agg, const int8_t val, const int8_t null_val)
RUNTIME_EXPORT NEVER_INLINE int32_t extract_str_len_noinline(const uint64_t str_and_len)
__device__ int32_t agg_sum_int32_skip_val_shared(int32_t *agg, const int32_t val, const int32_t skip_val)
RUNTIME_EXPORT ALWAYS_INLINE int64_t decimal_floor(const int64_t x, const int64_t scale)
__device__ void linear_probabilistic_count(uint8_t *bitmap, const uint32_t bitmap_bytes, const uint8_t *key_bytes, const uint32_t key_len)
RUNTIME_EXPORT ALWAYS_INLINE double width_bucket_nullable(const double target_value, const double lower_bound, const double upper_bound, const double scale_factor, const int32_t partition_count, const double null_val)
RUNTIME_EXPORT ALWAYS_INLINE uint64_t agg_count_double(uint64_t *agg, const double val)
__device__ void agg_count_distinct_bitmap_gpu(int64_t *agg, const int64_t val, const int64_t min_val, const int64_t base_dev_addr, const int64_t base_host_addr, const uint64_t sub_bitmap_count, const uint64_t bitmap_bytes)
#define DEF_SHARED_AGG_RET_STUBS(base_agg_func)
#define INT32_MIN
__device__ void agg_sum_double_shared(int64_t *agg, const double val)
int count
RUNTIME_EXPORT ALWAYS_INLINE void agg_min_double(int64_t *agg, const double val)
RUNTIME_EXPORT ALWAYS_INLINE int64_t decimal_ceil(const int64_t x, const int64_t scale)
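decimal_floor and decimal_ceil operate on fixed-point decimals stored as scaled integers (for example, DECIMAL(10,2) keeps 12.34 as 1234, so scale is 100) and snap the value to a whole-unit boundary. A minimal sketch of that arithmetic.
#include <cstdint>

int64_t decimal_floor_sketch(const int64_t x, const int64_t scale) {
  if (x >= 0) {
    return x / scale * scale;  // truncation already floors non-negatives
  }
  if (x % scale == 0) {
    return x;  // exactly on a unit boundary
  }
  return x / scale * scale - scale;  // step down one whole unit for negatives
}

int64_t decimal_ceil_sketch(const int64_t x, const int64_t scale) {
  return -decimal_floor_sketch(-x, scale);  // ceil(x) == -floor(-x)
}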
#define DEF_ARITH_NULLABLE_LHS(type, null_type, opname, opsym)
RUNTIME_EXPORT ALWAYS_INLINE double width_bucket_reversed_nullable(const double target_value, const double lower_bound, const double upper_bound, const double scale_factor, const int32_t partition_count, const double null_val)
#define DEF_AGG_MIN_INT(n)
RUNTIME_EXPORT ALWAYS_INLINE uint64_t agg_count_double_skip_val(uint64_t *agg, const double val, const double skip_val)
RUNTIME_EXPORT ALWAYS_INLINE void agg_min(int64_t *agg, const int64_t val)
__device__ int32_t pos_start_impl(const int32_t *row_index_resume)
Definition: cuda_mapd_rt.cu:27
RUNTIME_EXPORT ALWAYS_INLINE int8_t logical_not(const int8_t operand, const int8_t null_val)
RUNTIME_EXPORT ALWAYS_INLINE void agg_id_float(int32_t *agg, const float val)
RUNTIME_EXPORT ALWAYS_INLINE float load_float(const int32_t *agg)
__device__ int32_t runtime_interrupt_flag
Definition: cuda_mapd_rt.cu:95
RUNTIME_EXPORT ALWAYS_INLINE void agg_sum_double(int64_t *agg, const double val)
RUNTIME_EXPORT ALWAYS_INLINE int64_t row_number_window_func(const int64_t output_buff, const int64_t pos)
RUNTIME_EXPORT NEVER_INLINE void init_columnar_group_by_buffer_gpu(int64_t *groups_buffer, const int64_t *init_vals, const uint32_t groups_buffer_entry_count, const uint32_t key_qw_count, const uint32_t agg_col_count, const bool keyless, const bool blocks_share_memory, const int32_t frag_idx)
__device__ void agg_approximate_count_distinct_gpu(int64_t *agg, const int64_t key, const uint32_t b, const int64_t base_dev_addr, const int64_t base_host_addr)
__device__ void sync_warp()
RUNTIME_EXPORT ALWAYS_INLINE int64_t scale_decimal_down_nullable(const int64_t operand, const int64_t scale, const int64_t null_val)
RUNTIME_EXPORT ALWAYS_INLINE void agg_id(int64_t *agg, const int64_t val)
RUNTIME_EXPORT ALWAYS_INLINE int64_t scale_decimal_up(const int64_t operand, const uint64_t scale, const int64_t operand_null_val, const int64_t result_null_val)
RUNTIME_EXPORT ALWAYS_INLINE DEVICE int32_t char_length(const char *str, const int32_t str_len)
#define RUNTIME_EXPORT
RUNTIME_EXPORT ALWAYS_INLINE uint64_t string_pack(const int8_t *ptr, const int32_t len)
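string_pack and the extract_str_* helpers round-trip a (pointer, length) pair through a single uint64_t, assuming the pointer fits in the low 48 bits and the length in the high 16. A sketch of that packing.
#include <cstdint>

uint64_t string_pack_sketch(const int8_t* ptr, const int32_t len) {
  return (reinterpret_cast<uint64_t>(ptr) & 0xffffffffffffULL) |
         (static_cast<uint64_t>(len) << 48);
}

const int8_t* extract_str_ptr_sketch(const uint64_t str_and_len) {
  return reinterpret_cast<const int8_t*>(str_and_len & 0xffffffffffffULL);
}

int32_t extract_str_len_sketch(const uint64_t str_and_len) {
  return static_cast<int32_t>(str_and_len >> 48);  // length occupies the top 16 bits
}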
RUNTIME_EXPORT ALWAYS_INLINE int8_t bit_is_set(const int64_t bitset, const int64_t val, const int64_t min_val, const int64_t max_val, const int64_t null_val, const int8_t null_bool_val)
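bit_is_set probes a value-range bitmap: NULL inputs propagate the NULL boolean, values outside [min_val, max_val] are definitely absent, and everything else tests a single bit. A sketch, assuming bitset is the bitmap's address carried as an int64_t.
#include <cstdint>

int8_t bit_is_set_sketch(const int64_t bitset,
                         const int64_t val,
                         const int64_t min_val,
                         const int64_t max_val,
                         const int64_t null_val,
                         const int8_t null_bool_val) {
  if (val == null_val) {
    return null_bool_val;  // NULL in, NULL boolean out
  }
  if (val < min_val || val > max_val) {
    return 0;  // outside the bitmap's domain
  }
  const uint64_t bitmap_idx = static_cast<uint64_t>(val - min_val);
  const auto* bitmap = reinterpret_cast<const int8_t*>(bitset);
  return (bitmap[bitmap_idx >> 3] >> (bitmap_idx & 7)) & 1;
}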
RUNTIME_EXPORT ALWAYS_INLINE int32_t agg_sum_int32(int32_t *agg, const int32_t val)
RUNTIME_EXPORT ALWAYS_INLINE int32_t checked_single_agg_id_float(int32_t *agg, const float val, const float null_val)
__device__ void agg_sum_double_skip_val_shared(int64_t *agg, const double val, const double skip_val)
__device__ void agg_max_int8_skip_val_shared(int8_t *agg, const int8_t val, const int8_t skip_val)
RUNTIME_EXPORT ALWAYS_INLINE double load_double(const int64_t *agg)
RUNTIME_EXPORT NEVER_INLINE int8_t * extract_str_ptr_noinline(const uint64_t str_and_len)
RUNTIME_EXPORT ALWAYS_INLINE DEVICE double width_bucket_expr_nullable(const double target_value, const bool reversed, const double lower_bound, const double upper_bound, const int32_t partition_count, const double null_val)
RUNTIME_EXPORT ALWAYS_INLINE void agg_id_double(int64_t *agg, const double val)
DEVICE auto lower_bound(ARGS &&...args)
Definition: gpu_enabled.h:78
__device__ void agg_max_int16_skip_val_shared(int16_t *agg, const int16_t val, const int16_t skip_val)
RUNTIME_EXPORT NEVER_INLINE DEVICE uint32_t MurmurHash3(const void *key, int len, const uint32_t seed)
Definition: MurmurHash.cpp:33
RUNTIME_EXPORT ALWAYS_INLINE DEVICE double width_bucket_expr(const double target_value, const bool reversed, const double lower_bound, const double upper_bound, const int32_t partition_count)
RUNTIME_EXPORT void agg_max_int8(int8_t *agg, const int8_t val)
RUNTIME_EXPORT ALWAYS_INLINE int64_t * get_group_value_fast_keyless_semiprivate(int64_t *groups_buffer, const int64_t key, const int64_t min_key, const int64_t, const uint32_t row_size_quad, const uint8_t thread_warp_idx, const uint8_t warp_size)
RUNTIME_EXPORT ALWAYS_INLINE void agg_max_float(int32_t *agg, const float val)
RUNTIME_EXPORT void multifrag_query_hoisted_literals(const int8_t ***col_buffers, const uint64_t *num_fragments, const int8_t *literals, const int64_t *num_rows, const uint64_t *frag_row_offsets, const int32_t *max_matched, int32_t *total_matched, const int64_t *init_agg_value, int64_t **out, int32_t *error_code, const uint32_t *num_tables_ptr, const int64_t *join_hash_tables)
__device__ const int64_t * init_shared_mem(const int64_t *global_groups_buffer, const int32_t groups_buffer_size)
Definition: cuda_mapd_rt.cu:66
RUNTIME_EXPORT ALWAYS_INLINE int32_t checked_single_agg_id_double(int64_t *agg, const double val, const double null_val)
GPU_RT_STUB int32_t checked_single_agg_id_int16_shared(int16_t *agg, const int16_t val, const int16_t null_val)
RUNTIME_EXPORT ALWAYS_INLINE double load_avg_float(const int32_t *agg, const int32_t *count, const double null_val)
#define DEF_BINARY_NULLABLE_ALL_OPS(type, null_type)
RUNTIME_EXPORT NEVER_INLINE void init_group_by_buffer_impl(int64_t *groups_buffer, const int64_t *init_vals, const uint32_t groups_buffer_entry_count, const uint32_t key_qw_count, const uint32_t agg_col_count, const bool keyless, const int8_t warp_size)
RUNTIME_EXPORT ALWAYS_INLINE uint32_t agg_count_int32(uint32_t *agg, const int32_t)
RUNTIME_EXPORT ALWAYS_INLINE void set_matching_group_value_perfect_hash_columnar(int64_t *groups_buffer, const uint32_t hashed_index, const int64_t *key, const uint32_t key_count, const uint32_t entry_count)
#define NEVER_INLINE
#define DEF_ARITH_NULLABLE(type, null_type, opname, opsym)
__device__ void agg_min_int16_skip_val_shared(int16_t *agg, const int16_t val, const int16_t skip_val)
RUNTIME_EXPORT ALWAYS_INLINE int64_t agg_sum_skip_val(int64_t *agg, const int64_t val, const int64_t skip_val)
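The *_skip_val aggregates share one pattern: values equal to the NULL sentinel (skip_val) are ignored, and the accumulator is seeded by the first non-NULL value so that an all-NULL group stays at the sentinel. A sketch of that pattern for the 64-bit sum.
#include <cstdint>

int64_t agg_sum_skip_val_sketch(int64_t* agg, const int64_t val, const int64_t skip_val) {
  const int64_t old = *agg;
  if (val != skip_val) {
    *agg = (old == skip_val) ? val : old + val;  // seed on first non-NULL, then accumulate
  }
  return old;  // return the prior accumulator value (illustrative choice)
}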
__device__ void sync_threadblock()
__device__ void agg_min_int8_skip_val_shared(int8_t *agg, const int8_t val, const int8_t skip_val)
RUNTIME_EXPORT ALWAYS_INLINE int32_t agg_sum_int32_skip_val(int32_t *agg, const int32_t val, const int32_t skip_val)
RUNTIME_EXPORT ALWAYS_INLINE double load_avg_double(const int64_t *agg, const int64_t *count, const double null_val)
RUNTIME_EXPORT ALWAYS_INLINE uint64_t agg_count(uint64_t *agg, const int64_t)
RUNTIME_EXPORT ALWAYS_INLINE void agg_sum_float(int32_t *agg, const float val)
RUNTIME_EXPORT ALWAYS_INLINE double load_avg_decimal(const int64_t *sum, const int64_t *count, const double null_val, const uint32_t scale)
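The load_avg_* readers compute AVG lazily from a running-sum slot and a row-count slot; the decimal variant also undoes the fixed-point encoding by dividing by 10^scale. A sketch, assuming an empty group (count of zero) yields the NULL value.
#include <cmath>
#include <cstdint>

double load_avg_int_sketch(const int64_t* sum,
                           const int64_t* count,
                           const double null_val) {
  return *count != 0 ? static_cast<double>(*sum) / static_cast<double>(*count) : null_val;
}

double load_avg_decimal_sketch(const int64_t* sum,
                               const int64_t* count,
                               const double null_val,
                               const uint32_t scale) {
  return *count != 0
             ? static_cast<double>(*sum) / std::pow(10.0, scale) / static_cast<double>(*count)
             : null_val;
}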
RUNTIME_EXPORT void multifrag_query(const int8_t ***col_buffers, const uint64_t *num_fragments, const int64_t *num_rows, const uint64_t *frag_row_offsets, const int32_t *max_matched, int32_t *total_matched, const int64_t *init_agg_value, int64_t **out, int32_t *error_code, const uint32_t *num_tables_ptr, const int64_t *join_hash_tables)
RUNTIME_EXPORT bool check_interrupt_init(unsigned command)
RUNTIME_EXPORT ALWAYS_INLINE int32_t get_error_code(int32_t *error_codes)
RUNTIME_EXPORT ALWAYS_INLINE int64_t agg_sum(int64_t *agg, const int64_t val)
#define DEF_SHARED_AGG_STUBS(base_agg_func)
__device__ int32_t get_matching_group_value_columnar_slot(int64_t *groups_buffer, const uint32_t entry_count, const uint32_t h, const T *key, const uint32_t key_count)
__device__ int32_t checked_single_agg_id_shared(int64_t *agg, const int64_t val, const int64_t null_val)
RUNTIME_EXPORT NEVER_INLINE void query_stub_hoisted_literals(const int8_t **col_buffers, const int8_t *literals, const int64_t *num_rows, const uint64_t *frag_row_offsets, const int32_t *max_matched, const int64_t *init_agg_value, int64_t **out, uint32_t frag_idx, const int64_t *join_hash_tables, int32_t *error_code, int32_t *total_matched)
#define ALWAYS_INLINE
#define DEF_AGG_ID_INT(n)
RUNTIME_EXPORT ALWAYS_INLINE void record_error_code(const int32_t err_code, int32_t *error_codes)
RUNTIME_EXPORT ALWAYS_INLINE DEVICE bool sample_ratio(const double proportion, const int64_t row_offset)
FORCE_INLINE HOST DEVICE T align_to_int64(T addr)
RUNTIME_EXPORT ALWAYS_INLINE double percent_window_func(const int64_t output_buff, const int64_t pos)
RUNTIME_EXPORT ALWAYS_INLINE int8_t logical_or(const int8_t lhs, const int8_t rhs, const int8_t null_val)
__device__ void force_sync()
RUNTIME_EXPORT void agg_min_int32(int32_t *agg, const int32_t val)
RUNTIME_EXPORT ALWAYS_INLINE uint32_t agg_count_float_skip_val(uint32_t *agg, const float val, const float skip_val)
RUNTIME_EXPORT ALWAYS_INLINE DEVICE int32_t char_length_nullable(const char *str, const int32_t str_len, const int32_t int_null)
RUNTIME_EXPORT ALWAYS_INLINE uint32_t agg_count_float(uint32_t *agg, const float val)
__device__ int32_t group_buff_idx_impl()
Definition: cuda_mapd_rt.cu:31