OmniSciDB  1dac507f6e
cuda_mapd_rt.cu
1 #include <cuda.h>
2 #include <float.h>
3 #include <stdint.h>
4 #include <limits>
5 #include "BufferCompaction.h"
6 #include "ExtensionFunctions.hpp"
7 #include "GpuRtConstants.h"
8 #include "HyperLogLogRank.h"
10 
11 extern "C" __device__ int32_t pos_start_impl(const int32_t* row_index_resume) {
12  return blockIdx.x * blockDim.x + threadIdx.x;
13 }
14 
15 extern "C" __device__ int32_t group_buff_idx_impl() {
16  return pos_start_impl(NULL);
17 }
18 
19 extern "C" __device__ int32_t pos_step_impl() {
20  return blockDim.x * gridDim.x;
21 }
22 
23 extern "C" __device__ int8_t thread_warp_idx(const int8_t warp_sz) {
24  return threadIdx.x % warp_sz;
25 }
26 
27 extern "C" __device__ const int64_t* init_shared_mem_nop(
28  const int64_t* groups_buffer,
29  const int32_t groups_buffer_size) {
30  return groups_buffer;
31 }
32 
33 extern "C" __device__ void write_back_nop(int64_t* dest, int64_t* src, const int32_t sz) {
34 }
35 
36 extern "C" __device__ const int64_t* init_shared_mem(const int64_t* groups_buffer,
37  const int32_t groups_buffer_size) {
38  extern __shared__ int64_t fast_bins[];
39  if (threadIdx.x == 0) {
40  memcpy(fast_bins, groups_buffer, groups_buffer_size);
41  }
42  __syncthreads();
43  return fast_bins;
44 }
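// Note: groups_buffer_size above is a byte count (it is passed straight to memcpy);
// only thread 0 copies the global buffer into the dynamic shared-memory bins, and the
// __syncthreads() before the return makes the copy visible to the rest of the block.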
45 
51 extern "C" __device__ int64_t* alloc_shared_mem_dynamic() {
52  extern __shared__ int64_t groups_buffer_smem[];
53  return groups_buffer_smem;
54 }
55 
61 extern "C" __device__ void set_shared_mem_to_identity(
62  int64_t* groups_buffer_smem,
63  const int32_t groups_buffer_size,
64  const int64_t identity_element = 0) {
65 #pragma unroll
66  for (int i = threadIdx.x; i < groups_buffer_size; i += blockDim.x) {
67  groups_buffer_smem[i] = identity_element;
68  }
69  __syncthreads();
70 }
71 
77 extern "C" __device__ const int64_t* init_shared_mem_dynamic(
78  const int64_t* groups_buffer,
79  const int32_t groups_buffer_size) {
80  int64_t* groups_buffer_smem = alloc_shared_mem_dynamic();
81  set_shared_mem_to_identity(groups_buffer_smem, groups_buffer_size);
82  return groups_buffer_smem;
83 }
84 
85 extern "C" __device__ void write_back(int64_t* dest, int64_t* src, const int32_t sz) {
86  __syncthreads();
87  if (threadIdx.x == 0) {
88  memcpy(dest, src, sz);
89  }
90 }
91 
92 extern "C" __device__ void write_back_smem_nop(int64_t* dest,
93  int64_t* src,
94  const int32_t sz) {}
95 
96 extern "C" __device__ void agg_from_smem_to_gmem_nop(int64_t* gmem_dest,
97  int64_t* smem_src,
98  const int32_t num_elements) {}
99 
107 extern "C" __device__ void agg_from_smem_to_gmem_binId_count(int64_t* gmem_dest,
108  int64_t* smem_src,
109  const int32_t num_elements) {
110  __syncthreads();
111 #pragma unroll
112  for (int i = threadIdx.x; i < num_elements; i += blockDim.x) {
113  int32_t bin_id = *reinterpret_cast<int32_t*>(smem_src + i);
114  int32_t count_result = *(reinterpret_cast<int32_t*>(smem_src + i) + 1);
115  if (count_result) { // non-zero count
116  atomicAdd(reinterpret_cast<unsigned int*>(gmem_dest + i) + 1,
117  static_cast<int32_t>(count_result));
118  // writing back the binId, only if count_result is non-zero
119  *reinterpret_cast<unsigned int*>(gmem_dest + i) = static_cast<int32_t>(bin_id);
120  }
121  }
122 }
123 
131 extern "C" __device__ void agg_from_smem_to_gmem_count_binId(int64_t* gmem_dest,
132  int64_t* smem_src,
133  const int32_t num_elements) {
134  __syncthreads();
135 #pragma unroll
136  for (int i = threadIdx.x; i < num_elements; i += blockDim.x) {
137  int32_t count_result = *reinterpret_cast<int32_t*>(smem_src + i);
138  int32_t bin_id = *(reinterpret_cast<int32_t*>(smem_src + i) + 1);
139  if (count_result) { // non-zero count
140  atomicAdd(reinterpret_cast<unsigned int*>(gmem_dest + i),
141  static_cast<int32_t>(count_result));
142  // writing back the binId, only if count_result is non-zero
143  *(reinterpret_cast<unsigned int*>(gmem_dest + i) + 1) =
144  static_cast<int32_t>(bin_id);
145  }
146  }
147 }
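// Note: both agg_from_smem_to_gmem variants above treat each 64-bit slot as a packed
// pair of 32-bit values (bin id and count, in either order). The count half is merged
// into global memory with atomicAdd, and the bin id is written back only when a
// non-zero count shows that this block actually touched the bin.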
148 
149 #define init_group_by_buffer_gpu_impl init_group_by_buffer_gpu
150 
151 #include "GpuInitGroups.cu"
152 
153 #undef init_group_by_buffer_gpu_impl
154 
155 // Dynamic watchdog: monitoring up to 128 SMs. E.g. GP100 config may have 60:
156 // 6 Graphics Processing Clusters (GPCs) * 10 Streaming Multiprocessors
157 // TODO(Saman): move these into a kernel parameter, allocated and initialized through CUDA
158 __device__ int64_t dw_sm_cycle_start[128]; // Set from host before launching the kernel
159 // TODO(Saman): make this cycle budget something constant in codegen level
160 __device__ int64_t dw_cycle_budget = 0; // Set from host before launching the kernel
161 __device__ int32_t dw_abort = 0; // TBD: set from host (async)
162 
163 __inline__ __device__ uint32_t get_smid(void) {
164  uint32_t ret;
165  asm("mov.u32 %0, %%smid;" : "=r"(ret));
166  return ret;
167 }
168 
169 /*
170  * The main objective of this function is to return true if either of the following
171  * two scenarios happens:
172  * 1. it receives a host request to abort the kernel execution
173  * 2. kernel execution takes more clock cycles than it was initially allotted
174  * The assumption is that all (or no) threads within a block return true for the
175  * watchdog, and the first thread within each block compares the recorded clock cycles
176  * for its occupying SM with the allowed budget. It also assumes that all threads
177  * entering this function are active (no critical edge exposure).
178  * NOTE: dw_cycle_budget, dw_abort, and dw_sm_cycle_start[] are all variables in global
179  * memory scope.
180  */
181 extern "C" __device__ bool dynamic_watchdog() {
182  // check for dynamic watchdog; if triggered, all threads return true
183  if (dw_cycle_budget == 0LL) {
184  return false; // Uninitialized watchdog can't check time
185  }
186  if (dw_abort == 1) {
187  return true; // Received host request to abort
188  }
189  uint32_t smid = get_smid();
190  if (smid >= 128) {
191  return false;
192  }
193  __shared__ volatile int64_t dw_block_cycle_start; // Thread block shared cycle start
194  __shared__ volatile bool
195  dw_should_terminate; // all threads within a block should return together if
196  // watchdog criteria are met
197 
198  // thread 0 either initializes or reads the initial clock cycle; the result is stored
199  // in shared memory. Since all threads within a block share the same SM, there's no
200  // point in using more threads here.
201  if (threadIdx.x == 0) {
202  dw_block_cycle_start = 0LL;
203  int64_t cycle_count = static_cast<int64_t>(clock64());
204  // Make sure the block hasn't switched SMs
205  if (smid == get_smid()) {
206  dw_block_cycle_start = static_cast<int64_t>(
207  atomicCAS(reinterpret_cast<unsigned long long*>(&dw_sm_cycle_start[smid]),
208  0ULL,
209  static_cast<unsigned long long>(cycle_count)));
210  }
211 
212  int64_t cycles = cycle_count - dw_block_cycle_start;
213  if ((smid == get_smid()) && (dw_block_cycle_start > 0LL) &&
214  (cycles > dw_cycle_budget)) {
215  // Check if we're out of time on this particular SM
216  dw_should_terminate = true;
217  } else {
218  dw_should_terminate = false;
219  }
220  }
221  __syncthreads();
222  return dw_should_terminate;
223 }
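// Illustrative host-side sketch (an editorial addition, not part of the original
// runtime): one way the watchdog globals above could be reset before a kernel launch,
// assuming the CUDA runtime API is available in the launching translation unit. The
// function name and budget value are placeholders chosen for the example; error
// checking is omitted.
static inline void reset_dynamic_watchdog_example(const int64_t allowed_cycles) {
  int64_t zeroed_cycle_starts[128] = {0};
  int32_t no_abort = 0;
  cudaMemcpyToSymbol(dw_sm_cycle_start, zeroed_cycle_starts, sizeof(zeroed_cycle_starts));
  cudaMemcpyToSymbol(dw_cycle_budget, &allowed_cycles, sizeof(allowed_cycles));
  cudaMemcpyToSymbol(dw_abort, &no_abort, sizeof(no_abort));
  // Later, asynchronously setting dw_abort to 1 asks all blocks to return early.
}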
224 
225 template <typename T = unsigned long long>
226 inline __device__ T get_empty_key() {
227  return EMPTY_KEY_64;
228 }
229 
230 template <>
231 inline __device__ unsigned int get_empty_key() {
232  return EMPTY_KEY_32;
233 }
234 
235 template <typename T>
236 inline __device__ int64_t* get_matching_group_value(int64_t* groups_buffer,
237  const uint32_t h,
238  const T* key,
239  const uint32_t key_count,
240  const uint32_t row_size_quad) {
241  const T empty_key = get_empty_key<T>();
242  uint32_t off = h * row_size_quad;
243  auto row_ptr = reinterpret_cast<T*>(groups_buffer + off);
244  {
245  const T old = atomicCAS(row_ptr, empty_key, *key);
246  if (empty_key == old && key_count > 1) {
247  for (size_t i = 1; i <= key_count - 1; ++i) {
248  atomicExch(row_ptr + i, key[i]);
249  }
250  }
251  }
252  if (key_count > 1) {
253  while (atomicAdd(row_ptr + key_count - 1, 0) == empty_key) {
254  // spin until the winning thread has finished writing the entire key and the init
255  // value
256  }
257  }
258  bool match = true;
259  for (uint32_t i = 0; i < key_count; ++i) {
260  if (row_ptr[i] != key[i]) {
261  match = false;
262  break;
263  }
264  }
265 
266  if (match) {
267  auto row_ptr_i8 = reinterpret_cast<int8_t*>(row_ptr + key_count);
268  return reinterpret_cast<int64_t*>(align_to_int64(row_ptr_i8));
269  }
270  return NULL;
271 }
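// Note on the protocol above: the first key slot of the row is claimed with an
// atomicCAS against the empty key; the winning thread then publishes the remaining key
// components, while every other thread spins on the last component before re-checking
// the full key. A complete match returns the 64-bit-aligned payload that follows the
// key, and a mismatch returns NULL so the caller can probe a different bucket.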
272 
273 extern "C" __device__ int64_t* get_matching_group_value(int64_t* groups_buffer,
274  const uint32_t h,
275  const int64_t* key,
276  const uint32_t key_count,
277  const uint32_t key_width,
278  const uint32_t row_size_quad,
279  const int64_t* init_vals) {
280  switch (key_width) {
281  case 4:
282  return get_matching_group_value(groups_buffer,
283  h,
284  reinterpret_cast<const unsigned int*>(key),
285  key_count,
286  row_size_quad);
287  case 8:
288  return get_matching_group_value(groups_buffer,
289  h,
290  reinterpret_cast<const unsigned long long*>(key),
291  key_count,
292  row_size_quad);
293  default:
294  return NULL;
295  }
296 }
297 
298 template <typename T>
299 __device__ int32_t get_matching_group_value_columnar_slot(int64_t* groups_buffer,
300  const uint32_t entry_count,
301  const uint32_t h,
302  const T* key,
303  const uint32_t key_count) {
304  const T empty_key = get_empty_key<T>();
305  const uint64_t old =
306  atomicCAS(reinterpret_cast<T*>(groups_buffer + h), empty_key, *key);
307  // the winner thread proceeds with writing the rest of the keys
308  if (old == empty_key) {
309  uint32_t offset = h + entry_count;
310  for (size_t i = 1; i < key_count; ++i) {
311  *reinterpret_cast<T*>(groups_buffer + offset) = key[i];
312  offset += entry_count;
313  }
314  }
315 
316  __threadfence();
317  // for all threads except the winning thread, the memory content of the keys
318  // related to the hash offset is checked again. In case of a complete match
319  // the hash offset is returned, otherwise -1 is returned.
320  if (old != empty_key) {
321  uint32_t offset = h;
322  for (uint32_t i = 0; i < key_count; ++i) {
323  if (*reinterpret_cast<T*>(groups_buffer + offset) != key[i]) {
324  return -1;
325  }
326  offset += entry_count;
327  }
328  }
329  return h;
330 }
331 
332 extern "C" __device__ int32_t
333 get_matching_group_value_columnar_slot(int64_t* groups_buffer,
334  const uint32_t entry_count,
335  const uint32_t h,
336  const int64_t* key,
337  const uint32_t key_count,
338  const uint32_t key_width) {
339  switch (key_width) {
340  case 4:
341  return get_matching_group_value_columnar_slot(
342  groups_buffer,
343  entry_count,
344  h,
345  reinterpret_cast<const unsigned int*>(key),
346  key_count);
347  case 8:
348  return get_matching_group_value_columnar_slot(
349  groups_buffer,
350  entry_count,
351  h,
352  reinterpret_cast<const unsigned long long*>(key),
353  key_count);
354  default:
355  return -1;
356  }
357 }
358 
359 extern "C" __device__ int64_t* get_matching_group_value_columnar(
360  int64_t* groups_buffer,
361  const uint32_t h,
362  const int64_t* key,
363  const uint32_t key_qw_count,
364  const size_t entry_count) {
365  uint32_t off = h;
366  {
367  const uint64_t old = atomicCAS(
368  reinterpret_cast<unsigned long long*>(groups_buffer + off), EMPTY_KEY_64, *key);
369  if (EMPTY_KEY_64 == old) {
370  for (size_t i = 0; i < key_qw_count; ++i) {
371  groups_buffer[off] = key[i];
372  off += entry_count;
373  }
374  return &groups_buffer[off];
375  }
376  }
377  __syncthreads();
378  off = h;
379  for (size_t i = 0; i < key_qw_count; ++i) {
380  if (groups_buffer[off] != key[i]) {
381  return NULL;
382  }
383  off += entry_count;
384  }
385  return &groups_buffer[off];
386 }
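// Note: in the columnar layout above, consecutive components of one key are stored
// entry_count slots apart, so the winner writes key[i] at offset h + i * entry_count
// and both paths return &groups_buffer[h + key_qw_count * entry_count], the first
// payload slot for this entry.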
387 
388 #include "GroupByRuntime.cpp"
390 #include "MurmurHash.cpp"
391 #include "TopKRuntime.cpp"
392 
393 __device__ int64_t atomicMax64(int64_t* address, int64_t val) {
394  unsigned long long int* address_as_ull = (unsigned long long int*)address;
395  unsigned long long int old = *address_as_ull, assumed;
396 
397  do {
398  assumed = old;
399  old = atomicCAS(address_as_ull, assumed, max((long long)val, (long long)assumed));
400  } while (assumed != old);
401 
402  return old;
403 }
404 
405 __device__ int64_t atomicMin64(int64_t* address, int64_t val) {
406  unsigned long long int* address_as_ull = (unsigned long long int*)address;
407  unsigned long long int old = *address_as_ull, assumed;
408 
409  do {
410  assumed = old;
411  old = atomicCAS(address_as_ull, assumed, min((long long)val, (long long)assumed));
412  } while (assumed != old);
413 
414  return old;
415 }
416 
417 // As of 20160418, CUDA 8.0EA only defines `atomicAdd(double*, double)` for compute
418 // capability >= 6.0.
419 #if CUDA_VERSION < 8000 || (defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 600)
420 __device__ double atomicAdd(double* address, double val) {
421  unsigned long long int* address_as_ull = (unsigned long long int*)address;
422  unsigned long long int old = *address_as_ull, assumed;
423 
424  do {
425  assumed = old;
426  old = atomicCAS(address_as_ull,
427  assumed,
428  __double_as_longlong(val + __longlong_as_double(assumed)));
429 
430  // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN)
431  } while (assumed != old);
432 
433  return __longlong_as_double(old);
434 }
435 #endif
436 
437 __device__ double atomicMax(double* address, double val) {
438  unsigned long long int* address_as_ull = (unsigned long long int*)address;
439  unsigned long long int old = *address_as_ull, assumed;
440 
441  do {
442  assumed = old;
443  old = atomicCAS(address_as_ull,
444  assumed,
445  __double_as_longlong(max(val, __longlong_as_double(assumed))));
446 
447  // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN)
448  } while (assumed != old);
449 
450  return __longlong_as_double(old);
451 }
452 
453 __device__ float atomicMax(float* address, float val) {
454  int* address_as_int = (int*)address;
455  int old = *address_as_int, assumed;
456 
457  do {
458  assumed = old;
459  old = atomicCAS(
460  address_as_int, assumed, __float_as_int(max(val, __int_as_float(assumed))));
461 
462  // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN)
463  } while (assumed != old);
464 
465  return __int_as_float(old);
466 }
467 
468 __device__ double atomicMin(double* address, double val) {
469  unsigned long long int* address_as_ull = (unsigned long long int*)address;
470  unsigned long long int old = *address_as_ull, assumed;
471 
472  do {
473  assumed = old;
474  old = atomicCAS(address_as_ull,
475  assumed,
476  __double_as_longlong(min(val, __longlong_as_double(assumed))));
477  } while (assumed != old);
478 
479  return __longlong_as_double(old);
480 }
481 
482 __device__ float atomicMin(float* address, float val) {
483  int* address_as_ull = (int*)address;
484  int old = *address_as_ull, assumed;
485 
486  do {
487  assumed = old;
488  old = atomicCAS(
489  address_as_ull, assumed, __float_as_int(min(val, __int_as_float(assumed))));
490  } while (assumed != old);
491 
492  return __int_as_float(old);
493 }
494 
495 extern "C" __device__ uint64_t agg_count_shared(uint64_t* agg, const int64_t val) {
496  return static_cast<uint64_t>(atomicAdd(reinterpret_cast<uint32_t*>(agg), 1UL));
497 }
498 
499 extern "C" __device__ uint32_t agg_count_int32_shared(uint32_t* agg, const int32_t val) {
500  return atomicAdd(agg, 1UL);
501 }
502 
503 extern "C" __device__ uint64_t agg_count_double_shared(uint64_t* agg, const double val) {
504  return agg_count_shared(agg, val);
505 }
506 
507 extern "C" __device__ uint32_t agg_count_float_shared(uint32_t* agg, const float val) {
508  return agg_count_int32_shared(agg, val);
509 }
510 
511 extern "C" __device__ int64_t agg_sum_shared(int64_t* agg, const int64_t val) {
512  return atomicAdd(reinterpret_cast<unsigned long long*>(agg), val);
513 }
514 
515 extern "C" __device__ int32_t agg_sum_int32_shared(int32_t* agg, const int32_t val) {
516  return atomicAdd(agg, val);
517 }
518 
519 extern "C" __device__ void agg_sum_float_shared(int32_t* agg, const float val) {
520  atomicAdd(reinterpret_cast<float*>(agg), val);
521 }
522 
523 extern "C" __device__ void agg_sum_double_shared(int64_t* agg, const double val) {
524  atomicAdd(reinterpret_cast<double*>(agg), val);
525 }
526 
527 extern "C" __device__ void agg_max_shared(int64_t* agg, const int64_t val) {
528  atomicMax64(agg, val);
529 }
530 
531 extern "C" __device__ void agg_max_int32_shared(int32_t* agg, const int32_t val) {
532  atomicMax(agg, val);
533 }
534 
535 extern "C" __device__ void agg_max_double_shared(int64_t* agg, const double val) {
536  atomicMax(reinterpret_cast<double*>(agg), val);
537 }
538 
539 extern "C" __device__ void agg_max_float_shared(int32_t* agg, const float val) {
540  atomicMax(reinterpret_cast<float*>(agg), val);
541 }
542 
543 extern "C" __device__ void agg_min_shared(int64_t* agg, const int64_t val) {
544  atomicMin64(agg, val);
545 }
546 
547 extern "C" __device__ void agg_min_int32_shared(int32_t* agg, const int32_t val) {
548  atomicMin(agg, val);
549 }
550 
551 // TODO(Saman): use 16-bit atomicCAS for Turing
552 extern "C" __device__ void atomicMax16(int16_t* agg, const int16_t val) {
553  // properly align the input pointer:
554  unsigned int* base_address_u32 =
555  reinterpret_cast<unsigned int*>(reinterpret_cast<size_t>(agg) & ~0x3);
556 
557  unsigned int old_value = *base_address_u32;
558  unsigned int swap_value, compare_value;
559  do {
560  compare_value = old_value;
561  swap_value =
562  (reinterpret_cast<size_t>(agg) & 0x2)
563  ? static_cast<unsigned int>(max(static_cast<int16_t>(old_value >> 16), val))
564  << 16 |
565  (old_value & 0xFFFF)
566  : (old_value & 0xFFFF0000) |
567  static_cast<unsigned int>(
568  max(static_cast<int16_t>(old_value & 0xFFFF), val));
569  old_value = atomicCAS(base_address_u32, compare_value, swap_value);
570  } while (old_value != compare_value);
571 }
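// Note: atomicMax16 above emulates a 16-bit atomic max with a 32-bit CAS on the
// enclosing aligned word. (agg & 0x2) selects the halfword that holds the target:
// the high halfword becomes max(old >> 16, val) << 16 | (old & 0xFFFF), the low one
// becomes (old & 0xFFFF0000) | max(old & 0xFFFF, val), and the loop retries whenever
// another thread modified the word between the read and the CAS.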
572 
573 extern "C" __device__ void atomicMax8(int8_t* agg, const int8_t val) {
574  // properly align the input pointer:
575  unsigned int* base_address_u32 =
576  reinterpret_cast<unsigned int*>(reinterpret_cast<size_t>(agg) & ~0x3);
577 
578  // __byte_perm(unsigned int A, unsigned int B, unsigned int s):
579  // if s == 0x3214 returns {A[31..24], A[23..16], A[15..8], B[7..0]}
580  // if s == 0x3240 returns {A[31..24], A[23..16], B[7...0], A[7..0]}
581  // if s == 0x3410 returns {A[31..24], B[7....0], A[15..8], A[7..0]}
582  // if s == 0x4210 returns {B[7....0], A[23..16], A[15..8], A[7..0]}
583  constexpr unsigned int byte_permutations[] = {0x3214, 0x3240, 0x3410, 0x4210};
584  unsigned int old_value = *base_address_u32;
585  unsigned int swap_value, compare_value;
586  do {
587  compare_value = old_value;
588  auto max_value = static_cast<unsigned int>(
589  // compare val with its corresponding bits in the compare_value
590  max(val,
591  static_cast<int8_t>(__byte_perm(
592  compare_value, 0, (reinterpret_cast<size_t>(agg) & 0x3) | 0x4440))));
593  swap_value = __byte_perm(
594  compare_value, max_value, byte_permutations[reinterpret_cast<size_t>(agg) & 0x3]);
595  old_value = atomicCAS(base_address_u32, compare_value, swap_value);
596  } while (compare_value != old_value);
597 }
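// Worked example for the byte selectors above: if agg points at byte 2 of its aligned
// word, (agg & 0x3) | 0x4440 == 0x4442, so __byte_perm extracts byte 2 of the current
// word into the low byte of the result (the 0x4 digits pick zero bytes from the second
// operand); byte_permutations[2] == 0x3410 then writes the new maximum back into byte 2
// while keeping the other three bytes of the word unchanged.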
598 
599 extern "C" __device__ void atomicMin16(int16_t* agg, const int16_t val) {
600  // properly align the input pointer:
601  unsigned int* base_address_u32 =
602  reinterpret_cast<unsigned int*>(reinterpret_cast<size_t>(agg) & ~0x3);
603 
604  unsigned int old_value = *base_address_u32;
605  unsigned int swap_value, compare_value;
606  do {
607  compare_value = old_value;
608  swap_value =
609  (reinterpret_cast<size_t>(agg) & 0x2)
610  ? static_cast<unsigned int>(min(static_cast<int16_t>(old_value >> 16), val))
611  << 16 |
612  (old_value & 0xFFFF)
613  : (old_value & 0xFFFF0000) |
614  static_cast<unsigned int>(
615  min(static_cast<int16_t>(old_value & 0xFFFF), val));
616  old_value = atomicCAS(base_address_u32, compare_value, swap_value);
617  } while (old_value != compare_value);
618 }
619 
620 extern "C" __device__ void atomicMin16SkipVal(int16_t* agg,
621  const int16_t val,
622  const int16_t skip_val) {
623  // properly align the input pointer:
624  unsigned int* base_address_u32 =
625  reinterpret_cast<unsigned int*>(reinterpret_cast<size_t>(agg) & ~0x3);
626 
627  unsigned int old_value = *base_address_u32;
628  unsigned int swap_value, compare_value;
629  do {
630  compare_value = old_value;
631  int16_t selected_old_val = (reinterpret_cast<size_t>(agg) & 0x2)
632  ? static_cast<int16_t>(old_value >> 16)
633  : static_cast<int16_t>(old_value & 0xFFFF);
634 
635  swap_value =
636  (reinterpret_cast<size_t>(agg) & 0x2)
637  ? static_cast<unsigned int>(
638  selected_old_val == skip_val ? val : min(selected_old_val, val))
639  << 16 |
640  (old_value & 0xFFFF)
641  : (old_value & 0xFFFF0000) |
642  static_cast<unsigned int>(
643  selected_old_val == skip_val ? val : min(selected_old_val, val));
644  old_value = atomicCAS(base_address_u32, compare_value, swap_value);
645  } while (old_value != compare_value);
646 }
647 
648 extern "C" __device__ void atomicMin8(int8_t* agg, const int8_t val) {
649  // properly align the input pointer:
650  unsigned int* base_address_u32 =
651  reinterpret_cast<unsigned int*>(reinterpret_cast<size_t>(agg) & ~0x3);
652 
653  constexpr unsigned int byte_permutations[] = {0x3214, 0x3240, 0x3410, 0x4210};
654  unsigned int old_value = *base_address_u32;
655  unsigned int swap_value, compare_value;
656  do {
657  compare_value = old_value;
658  auto min_value = static_cast<unsigned int>(
659  min(val,
660  static_cast<int8_t>(__byte_perm(
661  compare_value, 0, (reinterpret_cast<size_t>(agg) & 0x3) | 0x4440))));
662  swap_value = __byte_perm(
663  compare_value, min_value, byte_permutations[reinterpret_cast<size_t>(agg) & 0x3]);
664  old_value = atomicCAS(base_address_u32, compare_value, swap_value);
665  } while (compare_value != old_value);
666 }
667 
668 extern "C" __device__ void atomicMin8SkipVal(int8_t* agg,
669  const int8_t val,
670  const int8_t skip_val) {
671  // properly align the input pointer:
672  unsigned int* base_address_u32 =
673  reinterpret_cast<unsigned int*>(reinterpret_cast<size_t>(agg) & ~0x3);
674 
675  constexpr unsigned int byte_permutations[] = {0x3214, 0x3240, 0x3410, 0x4210};
676  unsigned int old_value = *base_address_u32;
677  unsigned int swap_value, compare_value;
678  do {
679  compare_value = old_value;
680  int8_t selected_old_val = static_cast<int8_t>(
681  __byte_perm(compare_value, 0, (reinterpret_cast<size_t>(agg) & 0x3) | 0x4440));
682  auto min_value = static_cast<unsigned int>(
683  selected_old_val == skip_val ? val : min(val, selected_old_val));
684  swap_value = __byte_perm(
685  compare_value, min_value, byte_permutations[reinterpret_cast<size_t>(agg) & 0x3]);
686  old_value = atomicCAS(base_address_u32, compare_value, swap_value);
687  } while (compare_value != old_value);
688 }
689 
690 extern "C" __device__ void agg_max_int16_shared(int16_t* agg, const int16_t val) {
691  return atomicMax16(agg, val);
692 }
693 
694 extern "C" __device__ void agg_max_int8_shared(int8_t* agg, const int8_t val) {
695  return atomicMax8(agg, val);
696 }
697 
698 extern "C" __device__ void agg_min_int16_shared(int16_t* agg, const int16_t val) {
699  return atomicMin16(agg, val);
700 }
701 
702 extern "C" __device__ void agg_min_int8_shared(int8_t* agg, const int8_t val) {
703  return atomicMin8(agg, val);
704 }
705 
706 extern "C" __device__ void agg_min_double_shared(int64_t* agg, const double val) {
707  atomicMin(reinterpret_cast<double*>(agg), val);
708 }
709 
710 extern "C" __device__ void agg_min_float_shared(int32_t* agg, const float val) {
711  atomicMin(reinterpret_cast<float*>(agg), val);
712 }
713 
714 extern "C" __device__ void agg_id_shared(int64_t* agg, const int64_t val) {
715  *agg = val;
716 }
717 
718 #define DEF_AGG_ID_INT_SHARED(n) \
719  extern "C" __device__ void agg_id_int##n##_shared(int##n##_t* agg, \
720  const int##n##_t val) { \
721  *agg = val; \
722  }
723 
724 DEF_AGG_ID_INT_SHARED(32)
725 DEF_AGG_ID_INT_SHARED(16)
726 DEF_AGG_ID_INT_SHARED(8)
727 #undef DEF_AGG_ID_INT_SHARED
728 
729 extern "C" __device__ void agg_id_double_shared(int64_t* agg, const double val) {
730  *agg = *(reinterpret_cast<const int64_t*>(&val));
731 }
732 
733 extern "C" __device__ void agg_id_double_shared_slow(int64_t* agg, const double* val) {
734  *agg = *(reinterpret_cast<const int64_t*>(val));
735 }
736 
737 extern "C" __device__ void agg_id_float_shared(int32_t* agg, const float val) {
738  *agg = __float_as_int(val);
739 }
740 
741 #define DEF_SKIP_AGG(base_agg_func) \
742  extern "C" __device__ ADDR_T base_agg_func##_skip_val_shared( \
743  ADDR_T* agg, const DATA_T val, const DATA_T skip_val) { \
744  if (val != skip_val) { \
745  return base_agg_func##_shared(agg, val); \
746  } \
747  return 0; \
748  }
749 
750 #define DATA_T int64_t
751 #define ADDR_T uint64_t
752 DEF_SKIP_AGG(agg_count)
753 #undef DATA_T
754 #undef ADDR_T
755 
756 #define DATA_T int32_t
757 #define ADDR_T uint32_t
758 DEF_SKIP_AGG(agg_count_int32)
759 #undef DATA_T
760 #undef ADDR_T
761 
762 // Initial value for nullable column is INT32_MIN
763 extern "C" __device__ void agg_max_int32_skip_val_shared(int32_t* agg,
764  const int32_t val,
765  const int32_t skip_val) {
766  if (val != skip_val) {
767  agg_max_int32_shared(agg, val);
768  }
769 }
770 
771 extern "C" __device__ void agg_max_int16_skip_val_shared(int16_t* agg,
772  const int16_t val,
773  const int16_t skip_val) {
774  if (val != skip_val) {
775  agg_max_int16_shared(agg, val);
776  }
777 }
778 
779 extern "C" __device__ void agg_min_int16_skip_val_shared(int16_t* agg,
780  const int16_t val,
781  const int16_t skip_val) {
782  if (val != skip_val) {
783  atomicMin16SkipVal(agg, val, skip_val);
784  }
785 }
786 
787 extern "C" __device__ void agg_max_int8_skip_val_shared(int8_t* agg,
788  const int8_t val,
789  const int8_t skip_val) {
790  if (val != skip_val) {
791  agg_max_int8_shared(agg, val);
792  }
793 }
794 
795 extern "C" __device__ void agg_min_int8_skip_val_shared(int8_t* agg,
796  const int8_t val,
797  const int8_t skip_val) {
798  if (val != skip_val) {
799  atomicMin8SkipVal(agg, val, skip_val);
800  }
801 }
802 
803 __device__ int32_t atomicMin32SkipVal(int32_t* address,
804  int32_t val,
805  const int32_t skip_val) {
806  int32_t old = atomicExch(address, INT_MAX);
807  return atomicMin(address, old == skip_val ? val : min(old, val));
808 }
809 
810 extern "C" __device__ void agg_min_int32_skip_val_shared(int32_t* agg,
811  const int32_t val,
812  const int32_t skip_val) {
813  if (val != skip_val) {
814  atomicMin32SkipVal(agg, val, skip_val);
815  }
816 }
817 
818 __device__ int32_t atomicSum32SkipVal(int32_t* address,
819  const int32_t val,
820  const int32_t skip_val) {
821  unsigned int* address_as_int = (unsigned int*)address;
822  int32_t old = atomicExch(address_as_int, 0);
823  int32_t old2 = atomicAdd(address_as_int, old == skip_val ? val : (val + old));
824  return old == skip_val ? old2 : (old2 + old);
825 }
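// Note: the exchange-then-add trick above atomically takes ownership of the running
// total (atomicExch with 0), folds val into it (replacing the skip_val sentinel on the
// first real update), and deposits the sum back with atomicAdd. Because addition
// commutes, updates from other threads that land between the two operations still end
// up in the final total.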
826 
827 extern "C" __device__ int32_t agg_sum_int32_skip_val_shared(int32_t* agg,
828  const int32_t val,
829  const int32_t skip_val) {
830  if (val != skip_val) {
831  const int32_t old = atomicSum32SkipVal(agg, val, skip_val);
832  return old;
833  }
834  return 0;
835 }
836 
837 __device__ int64_t atomicSum64SkipVal(int64_t* address,
838  const int64_t val,
839  const int64_t skip_val) {
840  unsigned long long int* address_as_ull = (unsigned long long int*)address;
841  int64_t old = atomicExch(address_as_ull, 0);
842  int64_t old2 = atomicAdd(address_as_ull, old == skip_val ? val : (val + old));
843  return old == skip_val ? old2 : (old2 + old);
844 }
845 
846 extern "C" __device__ int64_t agg_sum_skip_val_shared(int64_t* agg,
847  const int64_t val,
848  const int64_t skip_val) {
849  if (val != skip_val) {
850  return atomicSum64SkipVal(agg, val, skip_val);
851  }
852  return 0;
853 }
854 
855 __device__ int64_t atomicMin64SkipVal(int64_t* address,
856  int64_t val,
857  const int64_t skip_val) {
858  unsigned long long int* address_as_ull =
859  reinterpret_cast<unsigned long long int*>(address);
860  unsigned long long int old = *address_as_ull, assumed;
861 
862  do {
863  assumed = old;
864  old = atomicCAS(address_as_ull,
865  assumed,
866  assumed == skip_val ? val : min((long long)val, (long long)assumed));
867  } while (assumed != old);
868 
869  return old;
870 }
871 
872 extern "C" __device__ void agg_min_skip_val_shared(int64_t* agg,
873  const int64_t val,
874  const int64_t skip_val) {
875  if (val != skip_val) {
876  atomicMin64SkipVal(agg, val, skip_val);
877  }
878 }
879 
880 __device__ int64_t atomicMax64SkipVal(int64_t* address,
881  int64_t val,
882  const int64_t skip_val) {
883  unsigned long long int* address_as_ull =
884  reinterpret_cast<unsigned long long int*>(address);
885  unsigned long long int old = *address_as_ull, assumed;
886 
887  do {
888  assumed = old;
889  old = atomicCAS(address_as_ull,
890  assumed,
891  assumed == skip_val ? val : max((long long)val, (long long)assumed));
892  } while (assumed != old);
893 
894  return old;
895 }
896 
897 extern "C" __device__ void agg_max_skip_val_shared(int64_t* agg,
898  const int64_t val,
899  const int64_t skip_val) {
900  if (val != skip_val) {
901  atomicMax64SkipVal(agg, val, skip_val);
902  }
903 }
904 
905 #undef DEF_SKIP_AGG
906 #define DEF_SKIP_AGG(base_agg_func) \
907  extern "C" __device__ ADDR_T base_agg_func##_skip_val_shared( \
908  ADDR_T* agg, const DATA_T val, const DATA_T skip_val) { \
909  if (val != skip_val) { \
910  return base_agg_func##_shared(agg, val); \
911  } \
912  return *agg; \
913  }
914 
915 #define DATA_T double
916 #define ADDR_T uint64_t
917 DEF_SKIP_AGG(agg_count_double)
918 #undef ADDR_T
919 #undef DATA_T
920 
921 #define DATA_T float
922 #define ADDR_T uint32_t
923 DEF_SKIP_AGG(agg_count_float)
924 #undef ADDR_T
925 #undef DATA_T
926 
927 // Initial value for nullable column is FLOAT_MIN
928 extern "C" __device__ void agg_max_float_skip_val_shared(int32_t* agg,
929  const float val,
930  const float skip_val) {
931  if (__float_as_int(val) != __float_as_int(skip_val)) {
932  float old = atomicExch(reinterpret_cast<float*>(agg), -FLT_MAX);
933  atomicMax(reinterpret_cast<float*>(agg),
934  __float_as_int(old) == __float_as_int(skip_val) ? val : fmaxf(old, val));
935  }
936 }
937 
938 __device__ float atomicMinFltSkipVal(int32_t* address, float val, const float skip_val) {
939  float old = atomicExch(reinterpret_cast<float*>(address), FLT_MAX);
940  return atomicMin(
941  reinterpret_cast<float*>(address),
942  __float_as_int(old) == __float_as_int(skip_val) ? val : fminf(old, val));
943 }
944 
945 extern "C" __device__ void agg_min_float_skip_val_shared(int32_t* agg,
946  const float val,
947  const float skip_val) {
948  if (__float_as_int(val) != __float_as_int(skip_val)) {
949  atomicMinFltSkipVal(agg, val, skip_val);
950  }
951 }
952 
953 __device__ void atomicSumFltSkipVal(float* address,
954  const float val,
955  const float skip_val) {
956  float old = atomicExch(address, 0.f);
957  atomicAdd(address, __float_as_int(old) == __float_as_int(skip_val) ? val : (val + old));
958 }
959 
960 extern "C" __device__ void agg_sum_float_skip_val_shared(int32_t* agg,
961  const float val,
962  const float skip_val) {
963  if (__float_as_int(val) != __float_as_int(skip_val)) {
964  atomicSumFltSkipVal(reinterpret_cast<float*>(agg), val, skip_val);
965  }
966 }
967 
968 __device__ void atomicSumDblSkipVal(double* address,
969  const double val,
970  const double skip_val) {
971  unsigned long long int* address_as_ull = (unsigned long long int*)address;
972  double old = __longlong_as_double(atomicExch(address_as_ull, __double_as_longlong(0.)));
973  atomicAdd(
974  address,
975  __double_as_longlong(old) == __double_as_longlong(skip_val) ? val : (val + old));
976 }
977 
978 extern "C" __device__ void agg_sum_double_skip_val_shared(int64_t* agg,
979  const double val,
980  const double skip_val) {
981  if (__double_as_longlong(val) != __double_as_longlong(skip_val)) {
982  atomicSumDblSkipVal(reinterpret_cast<double*>(agg), val, skip_val);
983  }
984 }
985 
986 __device__ double atomicMinDblSkipVal(double* address,
987  double val,
988  const double skip_val) {
989  unsigned long long int* address_as_ull =
990  reinterpret_cast<unsigned long long int*>(address);
991  unsigned long long int old = *address_as_ull;
992  unsigned long long int skip_val_as_ull =
993  *reinterpret_cast<const unsigned long long*>(&skip_val);
994  unsigned long long int assumed;
995 
996  do {
997  assumed = old;
998  old = atomicCAS(address_as_ull,
999  assumed,
1000  assumed == skip_val_as_ull
1001  ? *reinterpret_cast<unsigned long long*>(&val)
1002  : __double_as_longlong(min(val, __longlong_as_double(assumed))));
1003  } while (assumed != old);
1004 
1005  return __longlong_as_double(old);
1006 }
1007 
1008 extern "C" __device__ void agg_min_double_skip_val_shared(int64_t* agg,
1009  const double val,
1010  const double skip_val) {
1011  if (val != skip_val) {
1012  atomicMinDblSkipVal(reinterpret_cast<double*>(agg), val, skip_val);
1013  }
1014 }
1015 
1016 __device__ double atomicMaxDblSkipVal(double* address,
1017  double val,
1018  const double skip_val) {
1019  unsigned long long int* address_as_ull = (unsigned long long int*)address;
1020  unsigned long long int old = *address_as_ull;
1021  unsigned long long int skip_val_as_ull = *((unsigned long long int*)&skip_val);
1022  unsigned long long int assumed;
1023 
1024  do {
1025  assumed = old;
1026  old = atomicCAS(address_as_ull,
1027  assumed,
1028  assumed == skip_val_as_ull
1029  ? *((unsigned long long int*)&val)
1030  : __double_as_longlong(max(val, __longlong_as_double(assumed))));
1031  } while (assumed != old);
1032 
1033  return __longlong_as_double(old);
1034 }
1035 
1036 extern "C" __device__ void agg_max_double_skip_val_shared(int64_t* agg,
1037  const double val,
1038  const double skip_val) {
1039  if (val != skip_val) {
1040  atomicMaxDblSkipVal(reinterpret_cast<double*>(agg), val, skip_val);
1041  }
1042 }
1043 
1044 #undef DEF_SKIP_AGG
1045 
1046 extern "C" __device__ bool slotEmptyKeyCAS(int64_t* slot,
1047  int64_t new_val,
1048  int64_t init_val) {
1049  auto slot_address = reinterpret_cast<unsigned long long int*>(slot);
1050  const auto empty_key =
1051  static_cast<unsigned long long int*>(static_cast<void*>(&init_val));
1052  const auto new_val_cast =
1053  static_cast<unsigned long long int*>(static_cast<void*>(&new_val));
1054 
1055  const auto old_val = atomicCAS(slot_address, *empty_key, *new_val_cast);
1056  if (old_val == *empty_key) {
1057  return true;
1058  } else {
1059  return false;
1060  }
1061 }
1062 
1063 extern "C" __device__ bool slotEmptyKeyCAS_int32(int32_t* slot,
1064  int32_t new_val,
1065  int32_t init_val) {
1066  unsigned int* slot_address = reinterpret_cast<unsigned int*>(slot);
1067  unsigned int compare_value = static_cast<unsigned int>(init_val);
1068  unsigned int swap_value = static_cast<unsigned int>(new_val);
1069 
1070  const unsigned int old_value = atomicCAS(slot_address, compare_value, swap_value);
1071  return old_value == compare_value;
1072 }
1073 #include <stdio.h>
1074 extern "C" __device__ bool slotEmptyKeyCAS_int16(int16_t* slot,
1075  int16_t new_val,
1076  int16_t init_val) {
1077  unsigned int* base_slot_address =
1078  reinterpret_cast<unsigned int*>(reinterpret_cast<size_t>(slot) & ~0x3);
1079  unsigned int old_value = *base_slot_address;
1080  unsigned int swap_value, compare_value;
1081  do {
1082  compare_value = old_value;
1083  // exit criteria: if init_val does not exist in the slot (some other thread has
1084  // succeeded)
1085  if (static_cast<unsigned int>(init_val) !=
1086  __byte_perm(
1087  compare_value, 0, (reinterpret_cast<size_t>(slot) & 0x2 ? 0x3244 : 0x4410))) {
1088  return false;
1089  }
1090  swap_value = __byte_perm(compare_value,
1091  static_cast<unsigned int>(new_val),
1092  (reinterpret_cast<size_t>(slot) & 0x2) ? 0x5410 : 0x3254);
1093  old_value = atomicCAS(base_slot_address, compare_value, swap_value);
1094  } while (compare_value != old_value);
1095  return true;
1096 }
1097 
1098 extern "C" __device__ bool slotEmptyKeyCAS_int8(int8_t* slot,
1099  int8_t new_val,
1100  int8_t init_val) {
1101  // properly align the slot address:
1102  unsigned int* base_slot_address =
1103  reinterpret_cast<unsigned int*>(reinterpret_cast<size_t>(slot) & ~0x3);
1104  constexpr unsigned int byte_permutations[] = {0x3214, 0x3240, 0x3410, 0x4210};
1105  unsigned int old_value = *base_slot_address;
1106  unsigned int swap_value, compare_value;
1107  do {
1108  compare_value = old_value;
1109  // exit criteria: if init_val does not exist in the slot (some other thread has
1110  // succeeded)
1111  if (static_cast<unsigned int>(init_val) !=
1112  __byte_perm(compare_value, 0, (reinterpret_cast<size_t>(slot) & 0x3) | 0x4440)) {
1113  return false;
1114  }
1115  swap_value = __byte_perm(compare_value,
1116  static_cast<unsigned int>(new_val),
1117  byte_permutations[reinterpret_cast<size_t>(slot) & 0x3]);
1118  old_value = atomicCAS(base_slot_address, compare_value, swap_value);
1119  } while (compare_value != old_value);
1120  return true;
1121 }
1122 
1123 #include "../Utils/ChunkIter.cpp"
1124 #include "DateTruncate.cpp"
1125 #include "ExtractFromTime.cpp"
1126 #define EXECUTE_INCLUDE
1127 #include "ArrayOps.cpp"
1128 #include "DateAdd.cpp"
1129 #include "StringFunctions.cpp"
1130 #undef EXECUTE_INCLUDE
1131 #include "../Utils/Regexp.cpp"
1132 #include "../Utils/StringLike.cpp"
1133 
1134 extern "C" __device__ uint64_t string_decode(int8_t* chunk_iter_, int64_t pos) {
1135  // TODO(alex): de-dup, the x64 version is basically identical
1136  ChunkIter* chunk_iter = reinterpret_cast<ChunkIter*>(chunk_iter_);
1137  VarlenDatum vd;
1138  bool is_end;
1139  ChunkIter_get_nth(chunk_iter, pos, false, &vd, &is_end);
1140  return vd.is_null ? 0
1141  : (reinterpret_cast<uint64_t>(vd.pointer) & 0xffffffffffff) |
1142  (static_cast<uint64_t>(vd.length) << 48);
1143 }
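// Note: the packed result above keeps the low 48 bits of the payload pointer and the
// string length in the upper 16 bits; a consumer can recover them as
// ptr = packed & 0xffffffffffffULL and len = packed >> 48 (with 0 signalling NULL).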
1144 
1145 extern "C" __device__ void linear_probabilistic_count(uint8_t* bitmap,
1146  const uint32_t bitmap_bytes,
1147  const uint8_t* key_bytes,
1148  const uint32_t key_len) {
1149  const uint32_t bit_pos = MurmurHash1(key_bytes, key_len, 0) % (bitmap_bytes * 8);
1150  const uint32_t word_idx = bit_pos / 32;
1151  const uint32_t bit_idx = bit_pos % 32;
1152  atomicOr(((uint32_t*)bitmap) + word_idx, 1 << bit_idx);
1153 }
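// Note: linear_probabilistic_count hashes the key once with MurmurHash1 and sets a
// single bit of the bitmap via a 32-bit atomicOr on the word containing it, so the
// bitmap's population count can later be turned into a distinct-count estimate.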
1154 
1155 extern "C" __device__ void agg_count_distinct_bitmap_gpu(int64_t* agg,
1156  const int64_t val,
1157  const int64_t min_val,
1158  const int64_t base_dev_addr,
1159  const int64_t base_host_addr,
1160  const uint64_t sub_bitmap_count,
1161  const uint64_t bitmap_bytes) {
1162  const uint64_t bitmap_idx = val - min_val;
1163  const uint32_t byte_idx = bitmap_idx >> 3;
1164  const uint32_t word_idx = byte_idx >> 2;
1165  const uint32_t byte_word_idx = byte_idx & 3;
1166  const int64_t host_addr = *agg;
1167  uint32_t* bitmap = (uint32_t*)(base_dev_addr + host_addr - base_host_addr +
1168  (threadIdx.x & (sub_bitmap_count - 1)) * bitmap_bytes);
1169  switch (byte_word_idx) {
1170  case 0:
1171  atomicOr(&bitmap[word_idx], 1 << (bitmap_idx & 7));
1172  break;
1173  case 1:
1174  atomicOr(&bitmap[word_idx], 1 << ((bitmap_idx & 7) + 8));
1175  break;
1176  case 2:
1177  atomicOr(&bitmap[word_idx], 1 << ((bitmap_idx & 7) + 16));
1178  break;
1179  case 3:
1180  atomicOr(&bitmap[word_idx], 1 << ((bitmap_idx & 7) + 24));
1181  break;
1182  default:
1183  break;
1184  }
1185 }
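// Worked example: for val - min_val == 21, byte_idx == 2, word_idx == 0 and
// byte_word_idx == 2, so the case 2 branch ORs in bit (21 & 7) + 16 == 21 of word 0,
// i.e. exactly the 21st bit of the (little-endian) bitmap, as byte-wise addressing
// would.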
1186 
1187 extern "C" __device__ void agg_count_distinct_bitmap_skip_val_gpu(
1188  int64_t* agg,
1189  const int64_t val,
1190  const int64_t min_val,
1191  const int64_t skip_val,
1192  const int64_t base_dev_addr,
1193  const int64_t base_host_addr,
1194  const uint64_t sub_bitmap_count,
1195  const uint64_t bitmap_bytes) {
1196  if (val != skip_val) {
1197  agg_count_distinct_bitmap_gpu(
1198  agg, val, min_val, base_dev_addr, base_host_addr, sub_bitmap_count, bitmap_bytes);
1199  }
1200 }
1201 
1202 extern "C" __device__ void agg_approximate_count_distinct_gpu(
1203  int64_t* agg,
1204  const int64_t key,
1205  const uint32_t b,
1206  const int64_t base_dev_addr,
1207  const int64_t base_host_addr) {
1208  const uint64_t hash = MurmurHash64A(&key, sizeof(key), 0);
1209  const uint32_t index = hash >> (64 - b);
1210  const int32_t rank = get_rank(hash << b, 64 - b);
1211  const int64_t host_addr = *agg;
1212  int32_t* M = (int32_t*)(base_dev_addr + host_addr - base_host_addr);
1213  atomicMax(&M[index], rank);
1214 }
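// Note: this is the HyperLogLog update path: the top b bits of the 64-bit MurmurHash
// select a register, get_rank() computes the rank of the remaining hash bits, and
// atomicMax keeps the largest rank observed per register in M (the recorded host
// address rebased onto base_dev_addr).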
1215 
1216 extern "C" __device__ void force_sync() {
1217  __threadfence_block();
1218 }
1219 
1220 extern "C" __device__ void sync_warp() {
1221 #if (CUDA_VERSION >= 9000)
1222  __syncwarp();
1223 #endif
1224 }
1225 
1233 extern "C" __device__ void sync_warp_protected(int64_t thread_pos, int64_t row_count) {
1234 #if (CUDA_VERSION >= 9000)
1235  // only syncing if NOT within the same warp as those threads experiencing the critical
1236  // edge
1237  if ((((row_count - 1) | 0x1F) - thread_pos) >= 32) {
1238  __syncwarp();
1239  }
1240 #endif
1241 }
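// Worked example: with row_count == 70, ((row_count - 1) | 0x1F) == 95, so threads
// 64..95 (the warp that straddles the last row and may have inactive lanes) compute a
// difference below 32 and skip the sync, while every fully populated warp
// (thread_pos <= 63) executes __syncwarp().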