OmniSciDB  cde582ebc3
cuda_mapd_rt.cu File Reference
#include <cuda.h>
#include <float.h>
#include <stdint.h>
#include <stdio.h>
#include <limits>
#include "BufferCompaction.h"
#include "ExtensionFunctions.hpp"
#include "GpuRtConstants.h"
#include "HyperLogLogRank.h"
#include "GpuInitGroups.cu"
#include "GroupByRuntime.cpp"
#include "JoinHashTable/Runtime/JoinHashTableQueryRuntime.cpp"
#include "MurmurHash.cpp"
#include "TopKRuntime.cpp"
#include "../Utils/ChunkIter.cpp"
#include "DateTruncate.cpp"
#include "ExtractFromTime.cpp"
#include "ArrayOps.cpp"
#include "DateAdd.cpp"
#include "GeoOps.cpp"
#include "StringFunctions.cpp"
#include "../Utils/Regexp.cpp"
#include "../Utils/StringLike.cpp"
+ Include dependency graph for cuda_mapd_rt.cu:

Go to the source code of this file.

Macros

#define init_group_by_buffer_gpu_impl   init_group_by_buffer_gpu
 
#define DEF_AGG_ID_INT_SHARED(n)
 
#define DEF_SKIP_AGG(base_agg_func)
 
#define DATA_T   int64_t
 
#define ADDR_T   uint64_t
 
#define DATA_T   int32_t
 
#define ADDR_T   uint32_t
 
#define DEF_SKIP_AGG(base_agg_func)
 
#define DATA_T   double
 
#define ADDR_T   uint64_t
 
#define DATA_T   float
 
#define ADDR_T   uint32_t
 
#define EXECUTE_INCLUDE
 

Functions

__device__ int64_t get_thread_index ()
 
__device__ int64_t get_block_index ()
 
__device__ int32_t pos_start_impl (const int32_t *row_index_resume)
 
__device__ int32_t group_buff_idx_impl ()
 
__device__ int32_t pos_step_impl ()
 
__device__ int8_t thread_warp_idx (const int8_t warp_sz)
 
__device__ const int64_t * init_shared_mem_nop (const int64_t *groups_buffer, const int32_t groups_buffer_size)
 
__device__ void write_back_nop (int64_t *dest, int64_t *src, const int32_t sz)
 
__device__ int64_t * declare_dynamic_shared_memory ()
 
__device__ const int64_t * init_shared_mem (const int64_t *global_groups_buffer, const int32_t groups_buffer_size)
 
__inline__ __device__ uint32_t get_smid (void)
 
__device__ bool dynamic_watchdog ()
 
__device__ bool check_interrupt ()
 
template<typename T = unsigned long long>
__device__ T get_empty_key ()
 
template<>
__device__ unsigned int get_empty_key ()
 
template<typename T >
__device__ int64_t * get_matching_group_value (int64_t *groups_buffer, const uint32_t h, const T *key, const uint32_t key_count, const uint32_t row_size_quad)
 
__device__ int64_t * get_matching_group_value (int64_t *groups_buffer, const uint32_t h, const int64_t *key, const uint32_t key_count, const uint32_t key_width, const uint32_t row_size_quad)
 
template<typename T >
__device__ int32_t get_matching_group_value_columnar_slot (int64_t *groups_buffer, const uint32_t entry_count, const uint32_t h, const T *key, const uint32_t key_count)
 
__device__ int32_t get_matching_group_value_columnar_slot (int64_t *groups_buffer, const uint32_t entry_count, const uint32_t h, const int64_t *key, const uint32_t key_count, const uint32_t key_width)
 
__device__ int64_t * get_matching_group_value_columnar (int64_t *groups_buffer, const uint32_t h, const int64_t *key, const uint32_t key_qw_count, const size_t entry_count)
 
__device__ int64_t atomicMax64 (int64_t *address, int64_t val)
 
__device__ int64_t atomicMin64 (int64_t *address, int64_t val)
 
__device__ double atomicMax (double *address, double val)
 
__device__ float atomicMax (float *address, float val)
 
__device__ double atomicMin (double *address, double val)
 
__device__ double atomicMin (float *address, float val)
 
__device__ uint64_t agg_count_shared (uint64_t *agg, const int64_t val)
 
__device__ uint32_t agg_count_int32_shared (uint32_t *agg, const int32_t val)
 
__device__ uint64_t agg_count_double_shared (uint64_t *agg, const double val)
 
__device__ uint32_t agg_count_float_shared (uint32_t *agg, const float val)
 
__device__ int64_t agg_sum_shared (int64_t *agg, const int64_t val)
 
__device__ int32_t agg_sum_int32_shared (int32_t *agg, const int32_t val)
 
__device__ void agg_sum_float_shared (int32_t *agg, const float val)
 
__device__ void agg_sum_double_shared (int64_t *agg, const double val)
 
__device__ void agg_max_shared (int64_t *agg, const int64_t val)
 
__device__ void agg_max_int32_shared (int32_t *agg, const int32_t val)
 
__device__ void agg_max_double_shared (int64_t *agg, const double val)
 
__device__ void agg_max_float_shared (int32_t *agg, const float val)
 
__device__ void agg_min_shared (int64_t *agg, const int64_t val)
 
__device__ void agg_min_int32_shared (int32_t *agg, const int32_t val)
 
__device__ void atomicMax16 (int16_t *agg, const int16_t val)
 
__device__ void atomicMax8 (int8_t *agg, const int8_t val)
 
__device__ void atomicMin16 (int16_t *agg, const int16_t val)
 
__device__ void atomicMin16SkipVal (int16_t *agg, const int16_t val, const int16_t skip_val)
 
__device__ void atomicMin8 (int8_t *agg, const int8_t val)
 
__device__ void atomicMin8SkipVal (int8_t *agg, const int8_t val, const int8_t skip_val)
 
__device__ void agg_max_int16_shared (int16_t *agg, const int16_t val)
 
__device__ void agg_max_int8_shared (int8_t *agg, const int8_t val)
 
__device__ void agg_min_int16_shared (int16_t *agg, const int16_t val)
 
__device__ void agg_min_int8_shared (int8_t *agg, const int8_t val)
 
__device__ void agg_min_double_shared (int64_t *agg, const double val)
 
__device__ void agg_min_float_shared (int32_t *agg, const float val)
 
__device__ void agg_id_shared (int64_t *agg, const int64_t val)
 
__device__ int8_t * agg_id_varlen_shared (int8_t *varlen_buffer, const int64_t offset, const int8_t *value, const int64_t size_bytes)
 
__device__ int32_t checked_single_agg_id_shared (int64_t *agg, const int64_t val, const int64_t null_val)
 
__device__ void agg_id_double_shared (int64_t *agg, const double val)
 
__device__ int32_t checked_single_agg_id_double_shared (int64_t *agg, const double val, const double null_val)
 
__device__ void agg_id_double_shared_slow (int64_t *agg, const double *val)
 
__device__ int32_t checked_single_agg_id_double_shared_slow (int64_t *agg, const double *valp, const double null_val)
 
__device__ void agg_id_float_shared (int32_t *agg, const float val)
 
__device__ int32_t checked_single_agg_id_float_shared (int32_t *agg, const float val, const float null_val)
 
__device__ void agg_max_int32_skip_val_shared (int32_t *agg, const int32_t val, const int32_t skip_val)
 
__device__ void agg_max_int16_skip_val_shared (int16_t *agg, const int16_t val, const int16_t skip_val)
 
__device__ void agg_min_int16_skip_val_shared (int16_t *agg, const int16_t val, const int16_t skip_val)
 
__device__ void agg_max_int8_skip_val_shared (int8_t *agg, const int8_t val, const int8_t skip_val)
 
__device__ void agg_min_int8_skip_val_shared (int8_t *agg, const int8_t val, const int8_t skip_val)
 
__device__ int32_t atomicMin32SkipVal (int32_t *address, int32_t val, const int32_t skip_val)
 
__device__ void agg_min_int32_skip_val_shared (int32_t *agg, const int32_t val, const int32_t skip_val)
 
__device__ int32_t atomicSum32SkipVal (int32_t *address, const int32_t val, const int32_t skip_val)
 
__device__ int32_t agg_sum_int32_skip_val_shared (int32_t *agg, const int32_t val, const int32_t skip_val)
 
__device__ int64_t atomicSum64SkipVal (int64_t *address, const int64_t val, const int64_t skip_val)
 
__device__ int64_t agg_sum_skip_val_shared (int64_t *agg, const int64_t val, const int64_t skip_val)
 
__device__ int64_t atomicMin64SkipVal (int64_t *address, int64_t val, const int64_t skip_val)
 
__device__ void agg_min_skip_val_shared (int64_t *agg, const int64_t val, const int64_t skip_val)
 
__device__ int64_t atomicMax64SkipVal (int64_t *address, int64_t val, const int64_t skip_val)
 
__device__ void agg_max_skip_val_shared (int64_t *agg, const int64_t val, const int64_t skip_val)
 
__device__ void agg_max_float_skip_val_shared (int32_t *agg, const float val, const float skip_val)
 
__device__ float atomicMinFltSkipVal (int32_t *address, float val, const float skip_val)
 
__device__ void agg_min_float_skip_val_shared (int32_t *agg, const float val, const float skip_val)
 
__device__ void atomicSumFltSkipVal (float *address, const float val, const float skip_val)
 
__device__ void agg_sum_float_skip_val_shared (int32_t *agg, const float val, const float skip_val)
 
__device__ void atomicSumDblSkipVal (double *address, const double val, const double skip_val)
 
__device__ void agg_sum_double_skip_val_shared (int64_t *agg, const double val, const double skip_val)
 
__device__ double atomicMinDblSkipVal (double *address, double val, const double skip_val)
 
__device__ void agg_min_double_skip_val_shared (int64_t *agg, const double val, const double skip_val)
 
__device__ void agg_max_double_skip_val_shared (int64_t *agg, const double val, const double skip_val)
 
__device__ bool slotEmptyKeyCAS (int64_t *slot, int64_t new_val, int64_t init_val)
 
__device__ bool slotEmptyKeyCAS_int32 (int32_t *slot, int32_t new_val, int32_t init_val)
 
__device__ bool slotEmptyKeyCAS_int16 (int16_t *slot, int16_t new_val, int16_t init_val)
 
__device__ bool slotEmptyKeyCAS_int8 (int8_t *slot, int8_t new_val, int8_t init_val)
 
__device__ uint64_t string_decode (int8_t *chunk_iter_, int64_t pos)
 
__device__ void linear_probabilistic_count (uint8_t *bitmap, const uint32_t bitmap_bytes, const uint8_t *key_bytes, const uint32_t key_len)
 
__device__ void agg_count_distinct_bitmap_gpu (int64_t *agg, const int64_t val, const int64_t min_val, const int64_t base_dev_addr, const int64_t base_host_addr, const uint64_t sub_bitmap_count, const uint64_t bitmap_bytes)
 
__device__ void agg_count_distinct_bitmap_skip_val_gpu (int64_t *agg, const int64_t val, const int64_t min_val, const int64_t skip_val, const int64_t base_dev_addr, const int64_t base_host_addr, const uint64_t sub_bitmap_count, const uint64_t bitmap_bytes)
 
__device__ void agg_approximate_count_distinct_gpu (int64_t *agg, const int64_t key, const uint32_t b, const int64_t base_dev_addr, const int64_t base_host_addr)
 
__device__ void force_sync ()
 
__device__ void sync_warp ()
 
__device__ void sync_warp_protected (int64_t thread_pos, int64_t row_count)
 
__device__ void sync_threadblock ()
 
__device__ void write_back_non_grouped_agg (int64_t *input_buffer, int64_t *output_buffer, const int32_t agg_idx)
 

Variables

__device__ int64_t dw_sm_cycle_start [128]
 
__device__ int64_t dw_cycle_budget = 0
 
__device__ int32_t dw_abort = 0
 
__device__ int32_t runtime_interrupt_flag = 0
 

Macro Definition Documentation

#define ADDR_T   uint64_t

Definition at line 1005 of file cuda_mapd_rt.cu.

#define ADDR_T   uint32_t

Definition at line 1005 of file cuda_mapd_rt.cu.

#define ADDR_T   uint64_t

Definition at line 1005 of file cuda_mapd_rt.cu.

#define ADDR_T   uint32_t

Definition at line 1005 of file cuda_mapd_rt.cu.

#define DATA_T   int64_t

Definition at line 1004 of file cuda_mapd_rt.cu.

#define DATA_T   int32_t

Definition at line 1004 of file cuda_mapd_rt.cu.

#define DATA_T   double

Definition at line 1004 of file cuda_mapd_rt.cu.

#define DATA_T   float

Definition at line 1004 of file cuda_mapd_rt.cu.

#define DEF_AGG_ID_INT_SHARED (   n)
Value:
extern "C" __device__ void agg_id_int##n##_shared(int##n##_t* agg, \
const int##n##_t val) { \
*agg = val; \
}

Definition at line 717 of file cuda_mapd_rt.cu.
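
For orientation, instantiating this macro with n = 32 would expand to roughly the following (illustrative expansion, not a separately documented entity):

extern "C" __device__ void agg_id_int32_shared(int32_t* agg, const int32_t val) {
  *agg = val;  // plain store, exactly as in the macro body above
}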

#define DEF_SKIP_AGG (   base_agg_func)
Value:
extern "C" __device__ ADDR_T base_agg_func##_skip_val_shared( \
ADDR_T* agg, const DATA_T val, const DATA_T skip_val) { \
if (val != skip_val) { \
return base_agg_func##_shared(agg, val); \
} \
return 0; \
}
#define DATA_T
#define ADDR_T

Definition at line 989 of file cuda_mapd_rt.cu.

#define DEF_SKIP_AGG (   base_agg_func)
Value:
extern "C" __device__ ADDR_T base_agg_func##_skip_val_shared( \
ADDR_T* agg, const DATA_T val, const DATA_T skip_val) { \
if (val != skip_val) { \
return base_agg_func##_shared(agg, val); \
} \
return *agg; \
}
#define DATA_T
#define ADDR_T

Definition at line 989 of file cuda_mapd_rt.cu.
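
As an illustration only, applying the first form to a hypothetical DEF_SKIP_AGG(agg_count), with DATA_T defined as int64_t and ADDR_T as uint64_t, would expand to roughly:

extern "C" __device__ uint64_t agg_count_skip_val_shared(uint64_t* agg,
                                                         const int64_t val,
                                                         const int64_t skip_val) {
  if (val != skip_val) {
    return agg_count_shared(agg, val);  // fall through to the plain shared aggregate
  }
  return 0;  // sentinel (NULL) values do not contribute
}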

#define EXECUTE_INCLUDE

Definition at line 1194 of file cuda_mapd_rt.cu.

#define init_group_by_buffer_gpu_impl   init_group_by_buffer_gpu

Definition at line 82 of file cuda_mapd_rt.cu.

Function Documentation

__device__ void agg_approximate_count_distinct_gpu ( int64_t *  agg,
const int64_t  key,
const uint32_t  b,
const int64_t  base_dev_addr,
const int64_t  base_host_addr 
)

Definition at line 1271 of file cuda_mapd_rt.cu.

References atomicMax(), get_rank(), and MurmurHash64A().

1276  {
1277  const uint64_t hash = MurmurHash64A(&key, sizeof(key), 0);
1278  const uint32_t index = hash >> (64 - b);
1279  const int32_t rank = get_rank(hash << b, 64 - b);
1280  const int64_t host_addr = *agg;
1281  int32_t* M = (int32_t*)(base_dev_addr + host_addr - base_host_addr);
1282  atomicMax(&M[index], rank);
1283 }
FORCE_INLINE uint8_t get_rank(uint64_t x, uint32_t b)
RUNTIME_EXPORT NEVER_INLINE DEVICE uint64_t MurmurHash64A(const void *key, int len, uint64_t seed)
Definition: MurmurHash.cpp:27
__device__ double atomicMax(double *address, double val)

+ Here is the call graph for this function:
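
For orientation, the register update above can be read as the following host-side sketch (illustration only; it assumes get_rank() returns the 1-based position of the first set bit of the remaining hash bits, as in HyperLogLogRank.h):

#include <cstdint>

// Host-side sketch of the same HyperLogLog update (not part of cuda_mapd_rt.cu):
// the top b bits of the hash pick the register, the rest determine the rank.
void hll_update(int32_t* M, uint64_t hash, uint32_t b) {
  const uint32_t index = hash >> (64 - b);  // register selected by top b bits
  uint64_t rest = hash << b;                // remaining 64 - b bits, MSB-aligned
  int32_t rank = 1;                         // 1-based position of the first set bit
  while (rank <= static_cast<int32_t>(64 - b) && !(rest & (1ULL << 63))) {
    rest <<= 1;
    ++rank;
  }
  if (rank > M[index]) {
    M[index] = rank;                        // atomicMax(&M[index], rank) on the device
  }
}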

__device__ void agg_count_distinct_bitmap_gpu ( int64_t *  agg,
const int64_t  val,
const int64_t  min_val,
const int64_t  base_dev_addr,
const int64_t  base_host_addr,
const uint64_t  sub_bitmap_count,
const uint64_t  bitmap_bytes 
)

Definition at line 1224 of file cuda_mapd_rt.cu.

Referenced by agg_count_distinct_bitmap_skip_val_gpu().

1230  {
1231  const uint64_t bitmap_idx = val - min_val;
1232  const uint32_t byte_idx = bitmap_idx >> 3;
1233  const uint32_t word_idx = byte_idx >> 2;
1234  const uint32_t byte_word_idx = byte_idx & 3;
1235  const int64_t host_addr = *agg;
1236  uint32_t* bitmap = (uint32_t*)(base_dev_addr + host_addr - base_host_addr +
1237  (threadIdx.x & (sub_bitmap_count - 1)) * bitmap_bytes);
1238  switch (byte_word_idx) {
1239  case 0:
1240  atomicOr(&bitmap[word_idx], 1 << (bitmap_idx & 7));
1241  break;
1242  case 1:
1243  atomicOr(&bitmap[word_idx], 1 << ((bitmap_idx & 7) + 8));
1244  break;
1245  case 2:
1246  atomicOr(&bitmap[word_idx], 1 << ((bitmap_idx & 7) + 16));
1247  break;
1248  case 3:
1249  atomicOr(&bitmap[word_idx], 1 << ((bitmap_idx & 7) + 24));
1250  break;
1251  default:
1252  break;
1253  }
1254 }

+ Here is the caller graph for this function:
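
The byte and word arithmetic above can be summarized by this host-side model (illustration only; the device code additionally selects a per-thread sub-bitmap and uses atomicOr):

#include <cstdint>

// Host-side model of the bit addressing in agg_count_distinct_bitmap_gpu:
// value -> bit index -> byte -> containing 32-bit word -> bit within that word.
void set_distinct_bit(uint32_t* bitmap, int64_t val, int64_t min_val) {
  const uint64_t bitmap_idx = val - min_val;          // bit position in the bitmap
  const uint32_t byte_idx = bitmap_idx >> 3;          // byte holding that bit
  const uint32_t word_idx = byte_idx >> 2;            // 32-bit word holding that byte
  const uint32_t bit_in_word = ((byte_idx & 3) << 3) | (bitmap_idx & 7);
  bitmap[word_idx] |= 1u << bit_in_word;              // atomicOr(...) on the device
}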

__device__ void agg_count_distinct_bitmap_skip_val_gpu ( int64_t *  agg,
const int64_t  val,
const int64_t  min_val,
const int64_t  skip_val,
const int64_t  base_dev_addr,
const int64_t  base_host_addr,
const uint64_t  sub_bitmap_count,
const uint64_t  bitmap_bytes 
)

Definition at line 1256 of file cuda_mapd_rt.cu.

References agg_count_distinct_bitmap_gpu().

1264  {
1265  if (val != skip_val) {
1266  agg_count_distinct_bitmap_gpu(
1267  agg, val, min_val, base_dev_addr, base_host_addr, sub_bitmap_count, bitmap_bytes);
1268  }
1269 }
__device__ void agg_count_distinct_bitmap_gpu(int64_t *agg, const int64_t val, const int64_t min_val, const int64_t base_dev_addr, const int64_t base_host_addr, const uint64_t sub_bitmap_count, const uint64_t bitmap_bytes)

+ Here is the call graph for this function:

__device__ uint64_t agg_count_double_shared ( uint64_t *  agg,
const double  val 
)

Definition at line 438 of file cuda_mapd_rt.cu.

References agg_count_shared().

438  {
439  return agg_count_shared(agg, val);
440 }
__device__ uint64_t agg_count_shared(uint64_t *agg, const int64_t val)

+ Here is the call graph for this function:

__device__ uint32_t agg_count_float_shared ( uint32_t *  agg,
const float  val 
)

Definition at line 442 of file cuda_mapd_rt.cu.

References agg_count_int32_shared().

442  {
443  return agg_count_int32_shared(agg, val);
444 }
__device__ uint32_t agg_count_int32_shared(uint32_t *agg, const int32_t val)

+ Here is the call graph for this function:

__device__ uint32_t agg_count_int32_shared ( uint32_t *  agg,
const int32_t  val 
)

Definition at line 434 of file cuda_mapd_rt.cu.

Referenced by agg_count_float_shared().

434  {
435  return atomicAdd(agg, 1UL);
436 }

+ Here is the caller graph for this function:

__device__ uint64_t agg_count_shared ( uint64_t *  agg,
const int64_t  val 
)

Definition at line 430 of file cuda_mapd_rt.cu.

Referenced by agg_count_double_shared().

430  {
431  return static_cast<uint64_t>(atomicAdd(reinterpret_cast<uint32_t*>(agg), 1UL));
432 }

+ Here is the caller graph for this function:

__device__ void agg_id_double_shared ( int64_t *  agg,
const double  val 
)

Definition at line 729 of file cuda_mapd_rt.cu.

729  {
730  *agg = *(reinterpret_cast<const int64_t*>(&val));
731 }
__device__ void agg_id_double_shared_slow ( int64_t *  agg,
const double *  val 
)

Definition at line 760 of file cuda_mapd_rt.cu.

760  {
761  *agg = *(reinterpret_cast<const int64_t*>(val));
762 }
__device__ void agg_id_float_shared ( int32_t *  agg,
const float  val 
)

Definition at line 793 of file cuda_mapd_rt.cu.

793  {
794  *agg = __float_as_int(val);
795 }
__device__ void agg_id_shared ( int64_t *  agg,
const int64_t  val 
)

Definition at line 676 of file cuda_mapd_rt.cu.

676  {
677  *agg = val;
678 }
__device__ int8_t* agg_id_varlen_shared ( int8_t *  varlen_buffer,
const int64_t  offset,
const int8_t *  value,
const int64_t  size_bytes 
)

Definition at line 680 of file cuda_mapd_rt.cu.

683  {
684  for (auto i = 0; i < size_bytes; i++) {
685  varlen_buffer[offset + i] = value[i];
686  }
687  return &varlen_buffer[offset];
688 }
__device__ void agg_max_double_shared ( int64_t *  agg,
const double  val 
)

Definition at line 470 of file cuda_mapd_rt.cu.

References atomicMax().

470  {
471  atomicMax(reinterpret_cast<double*>(agg), val);
472 }
__device__ double atomicMax(double *address, double val)

+ Here is the call graph for this function:

__device__ void agg_max_double_skip_val_shared ( int64_t *  agg,
const double  val,
const double  skip_val 
)

Definition at line 1099 of file cuda_mapd_rt.cu.

References atomicMax().

1101  {
1102  if (__double_as_longlong(val) != __double_as_longlong(skip_val)) {
1103  double old = __longlong_as_double(atomicExch(
1104  reinterpret_cast<unsigned long long int*>(agg), __double_as_longlong(-DBL_MAX)));
1105  atomicMax(reinterpret_cast<double*>(agg),
1106  __double_as_longlong(old) == __double_as_longlong(skip_val)
1107  ? val
1108  : fmax(old, val));
1109  }
1110 }
__device__ double atomicMax(double *address, double val)

+ Here is the call graph for this function:

__device__ void agg_max_float_shared ( int32_t *  agg,
const float  val 
)

Definition at line 474 of file cuda_mapd_rt.cu.

References atomicMax().

474  {
475  atomicMax(reinterpret_cast<float*>(agg), val);
476 }
__device__ double atomicMax(double *address, double val)

+ Here is the call graph for this function:

__device__ void agg_max_float_skip_val_shared ( int32_t *  agg,
const float  val,
const float  skip_val 
)

Definition at line 1011 of file cuda_mapd_rt.cu.

References atomicMax().

1013  {
1014  if (__float_as_int(val) != __float_as_int(skip_val)) {
1015  float old = atomicExch(reinterpret_cast<float*>(agg), -FLT_MAX);
1016  atomicMax(reinterpret_cast<float*>(agg),
1017  __float_as_int(old) == __float_as_int(skip_val) ? val : fmaxf(old, val));
1018  }
1019 }
__device__ double atomicMax(double *address, double val)

+ Here is the call graph for this function:

__device__ void agg_max_int16_shared ( int16_t *  agg,
const int16_t  val 
)

Definition at line 652 of file cuda_mapd_rt.cu.

References atomicMax16().

Referenced by agg_max_int16_skip_val_shared().

652  {
653  return atomicMax16(agg, val);
654 }
__device__ void atomicMax16(int16_t *agg, const int16_t val)

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

__device__ void agg_max_int16_skip_val_shared ( int16_t *  agg,
const int16_t  val,
const int16_t  skip_val 
)

Definition at line 854 of file cuda_mapd_rt.cu.

References agg_max_int16_shared().

856  {
857  if (val != skip_val) {
858  agg_max_int16_shared(agg, val);
859  }
860 }
__device__ void agg_max_int16_shared(int16_t *agg, const int16_t val)

+ Here is the call graph for this function:

__device__ void agg_max_int32_shared ( int32_t *  agg,
const int32_t  val 
)

Definition at line 466 of file cuda_mapd_rt.cu.

References atomicMax().

Referenced by agg_max_int32_skip_val_shared().

466  {
467  atomicMax(agg, val);
468 }
__device__ double atomicMax(double *address, double val)

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

__device__ void agg_max_int32_skip_val_shared ( int32_t *  agg,
const int32_t  val,
const int32_t  skip_val 
)

Definition at line 846 of file cuda_mapd_rt.cu.

References agg_max_int32_shared().

848  {
849  if (val != skip_val) {
850  agg_max_int32_shared(agg, val);
851  }
852 }
__device__ void agg_max_int32_shared(int32_t *agg, const int32_t val)

+ Here is the call graph for this function:

__device__ void agg_max_int8_shared ( int8_t *  agg,
const int8_t  val 
)

Definition at line 656 of file cuda_mapd_rt.cu.

References atomicMax8().

Referenced by agg_max_int8_skip_val_shared().

656  {
657  return atomicMax8(agg, val);
658 }
__device__ void atomicMax8(int8_t *agg, const int8_t val)

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

__device__ void agg_max_int8_skip_val_shared ( int8_t *  agg,
const int8_t  val,
const int8_t  skip_val 
)

Definition at line 870 of file cuda_mapd_rt.cu.

References agg_max_int8_shared().

872  {
873  if (val != skip_val) {
874  agg_max_int8_shared(agg, val);
875  }
876 }
__device__ void agg_max_int8_shared(int8_t *agg, const int8_t val)

+ Here is the call graph for this function:

__device__ void agg_max_shared ( int64_t *  agg,
const int64_t  val 
)

Definition at line 462 of file cuda_mapd_rt.cu.

References atomicMax64().

462  {
463  atomicMax64(agg, val);
464 }
__device__ int64_t atomicMax64(int64_t *address, int64_t val)

+ Here is the call graph for this function:

__device__ void agg_max_skip_val_shared ( int64_t *  agg,
const int64_t  val,
const int64_t  skip_val 
)

Definition at line 980 of file cuda_mapd_rt.cu.

References atomicMax64SkipVal().

982  {
983  if (val != skip_val) {
984  atomicMax64SkipVal(agg, val, skip_val);
985  }
986 }
__device__ int64_t atomicMax64SkipVal(int64_t *address, int64_t val, const int64_t skip_val)

+ Here is the call graph for this function:

__device__ void agg_min_double_shared ( int64_t *  agg,
const double  val 
)

Definition at line 668 of file cuda_mapd_rt.cu.

References atomicMin().

668  {
669  atomicMin(reinterpret_cast<double*>(agg), val);
670 }
__device__ double atomicMin(double *address, double val)

+ Here is the call graph for this function:

__device__ void agg_min_double_skip_val_shared ( int64_t *  agg,
const double  val,
const double  skip_val 
)

Definition at line 1091 of file cuda_mapd_rt.cu.

References atomicMinDblSkipVal().

1093  {
1094  if (val != skip_val) {
1095  atomicMinDblSkipVal(reinterpret_cast<double*>(agg), val, skip_val);
1096  }
1097 }
__device__ double atomicMinDblSkipVal(double *address, double val, const double skip_val)

+ Here is the call graph for this function:

__device__ void agg_min_float_shared ( int32_t *  agg,
const float  val 
)

Definition at line 672 of file cuda_mapd_rt.cu.

References atomicMin().

672  {
673  atomicMin(reinterpret_cast<float*>(agg), val);
674 }
__device__ double atomicMin(double *address, double val)

+ Here is the call graph for this function:

__device__ void agg_min_float_skip_val_shared ( int32_t *  agg,
const float  val,
const float  skip_val 
)

Definition at line 1028 of file cuda_mapd_rt.cu.

References atomicMinFltSkipVal().

1030  {
1031  if (__float_as_int(val) != __float_as_int(skip_val)) {
1032  atomicMinFltSkipVal(agg, val, skip_val);
1033  }
1034 }
__device__ float atomicMinFltSkipVal(int32_t *address, float val, const float skip_val)

+ Here is the call graph for this function:

__device__ void agg_min_int16_shared ( int16_t *  agg,
const int16_t  val 
)

Definition at line 660 of file cuda_mapd_rt.cu.

References atomicMin16().

660  {
661  return atomicMin16(agg, val);
662 }
__device__ void atomicMin16(int16_t *agg, const int16_t val)

+ Here is the call graph for this function:

__device__ void agg_min_int16_skip_val_shared ( int16_t *  agg,
const int16_t  val,
const int16_t  skip_val 
)

Definition at line 862 of file cuda_mapd_rt.cu.

References atomicMin16SkipVal().

864  {
865  if (val != skip_val) {
866  atomicMin16SkipVal(agg, val, skip_val);
867  }
868 }
__device__ void atomicMin16SkipVal(int16_t *agg, const int16_t val, const int16_t skip_val)

+ Here is the call graph for this function:

__device__ void agg_min_int32_shared ( int32_t *  agg,
const int32_t  val 
)

Definition at line 482 of file cuda_mapd_rt.cu.

References atomicMin().

482  {
483  atomicMin(agg, val);
484 }
__device__ double atomicMin(double *address, double val)

+ Here is the call graph for this function:

__device__ void agg_min_int32_skip_val_shared ( int32_t *  agg,
const int32_t  val,
const int32_t  skip_val 
)

Definition at line 893 of file cuda_mapd_rt.cu.

References atomicMin32SkipVal().

895  {
896  if (val != skip_val) {
897  atomicMin32SkipVal(agg, val, skip_val);
898  }
899 }
__device__ int32_t atomicMin32SkipVal(int32_t *address, int32_t val, const int32_t skip_val)

+ Here is the call graph for this function:

__device__ void agg_min_int8_shared ( int8_t *  agg,
const int8_t  val 
)

Definition at line 664 of file cuda_mapd_rt.cu.

References atomicMin8().

664  {
665  return atomicMin8(agg, val);
666 }
__device__ void atomicMin8(int8_t *agg, const int8_t val)

+ Here is the call graph for this function:

__device__ void agg_min_int8_skip_val_shared ( int8_t *  agg,
const int8_t  val,
const int8_t  skip_val 
)

Definition at line 878 of file cuda_mapd_rt.cu.

References atomicMin8SkipVal().

880  {
881  if (val != skip_val) {
882  atomicMin8SkipVal(agg, val, skip_val);
883  }
884 }
__device__ void atomicMin8SkipVal(int8_t *agg, const int8_t val, const int8_t skip_val)

+ Here is the call graph for this function:

__device__ void agg_min_shared ( int64_t *  agg,
const int64_t  val 
)

Definition at line 478 of file cuda_mapd_rt.cu.

References atomicMin64().

478  {
479  atomicMin64(agg, val);
480 }
__device__ int64_t atomicMin64(int64_t *address, int64_t val)

+ Here is the call graph for this function:

__device__ void agg_min_skip_val_shared ( int64_t *  agg,
const int64_t  val,
const int64_t  skip_val 
)

Definition at line 955 of file cuda_mapd_rt.cu.

References atomicMin64SkipVal().

957  {
958  if (val != skip_val) {
959  atomicMin64SkipVal(agg, val, skip_val);
960  }
961 }
__device__ int64_t atomicMin64SkipVal(int64_t *address, int64_t val, const int64_t skip_val)

+ Here is the call graph for this function:

__device__ void agg_sum_double_shared ( int64_t *  agg,
const double  val 
)

Definition at line 458 of file cuda_mapd_rt.cu.

458  {
459  atomicAdd(reinterpret_cast<double*>(agg), val);
460 }
__device__ void agg_sum_double_skip_val_shared ( int64_t *  agg,
const double  val,
const double  skip_val 
)

Definition at line 1061 of file cuda_mapd_rt.cu.

References atomicSumDblSkipVal().

1063  {
1064  if (__double_as_longlong(val) != __double_as_longlong(skip_val)) {
1065  atomicSumDblSkipVal(reinterpret_cast<double*>(agg), val, skip_val);
1066  }
1067 }
__device__ void atomicSumDblSkipVal(double *address, const double val, const double skip_val)

+ Here is the call graph for this function:

__device__ void agg_sum_float_shared ( int32_t *  agg,
const float  val 
)

Definition at line 454 of file cuda_mapd_rt.cu.

454  {
455  atomicAdd(reinterpret_cast<float*>(agg), val);
456 }
__device__ void agg_sum_float_skip_val_shared ( int32_t *  agg,
const float  val,
const float  skip_val 
)

Definition at line 1043 of file cuda_mapd_rt.cu.

References atomicSumFltSkipVal().

1045  {
1046  if (__float_as_int(val) != __float_as_int(skip_val)) {
1047  atomicSumFltSkipVal(reinterpret_cast<float*>(agg), val, skip_val);
1048  }
1049 }
__device__ void atomicSumFltSkipVal(float *address, const float val, const float skip_val)

+ Here is the call graph for this function:

__device__ int32_t agg_sum_int32_shared ( int32_t *  agg,
const int32_t  val 
)

Definition at line 450 of file cuda_mapd_rt.cu.

450  {
451  return atomicAdd(agg, val);
452 }
__device__ int32_t agg_sum_int32_skip_val_shared ( int32_t *  agg,
const int32_t  val,
const int32_t  skip_val 
)

Definition at line 910 of file cuda_mapd_rt.cu.

References atomicSum32SkipVal().

912  {
913  if (val != skip_val) {
914  const int32_t old = atomicSum32SkipVal(agg, val, skip_val);
915  return old;
916  }
917  return 0;
918 }
__device__ int32_t atomicSum32SkipVal(int32_t *address, const int32_t val, const int32_t skip_val)

+ Here is the call graph for this function:

__device__ int64_t agg_sum_shared ( int64_t *  agg,
const int64_t  val 
)

Definition at line 446 of file cuda_mapd_rt.cu.

Referenced by write_back_non_grouped_agg().

446  {
447  return atomicAdd(reinterpret_cast<unsigned long long*>(agg), val);
448 }

+ Here is the caller graph for this function:

__device__ int64_t agg_sum_skip_val_shared ( int64_t *  agg,
const int64_t  val,
const int64_t  skip_val 
)

Definition at line 929 of file cuda_mapd_rt.cu.

References atomicSum64SkipVal().

931  {
932  if (val != skip_val) {
933  return atomicSum64SkipVal(agg, val, skip_val);
934  }
935  return 0;
936 }
__device__ int64_t atomicSum64SkipVal(int64_t *address, const int64_t val, const int64_t skip_val)

+ Here is the call graph for this function:

__device__ double atomicMax ( double *  address,
double  val 
)

Definition at line 372 of file cuda_mapd_rt.cu.

Referenced by agg_approximate_count_distinct_gpu(), agg_max_double_shared(), agg_max_double_skip_val_shared(), agg_max_float_shared(), agg_max_float_skip_val_shared(), agg_max_int32_shared(), and approximate_distinct_tuples_impl().

372  {
373  unsigned long long int* address_as_ull = (unsigned long long int*)address;
374  unsigned long long int old = *address_as_ull, assumed;
375 
376  do {
377  assumed = old;
378  old = atomicCAS(address_as_ull,
379  assumed,
380  __double_as_longlong(max(val, __longlong_as_double(assumed))));
381 
382  // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN)
383  } while (assumed != old);
384 
385  return __longlong_as_double(old);
386 }

+ Here is the caller graph for this function:
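
A minimal usage sketch (hypothetical kernel, not part of this file): every thread folds its input element into a single double-precision maximum.

__global__ void fold_max(const double* in, const int n, double* out_max) {
  const int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) {
    // resolves to the CAS loop above; the integer compare avoids hangs on NaN
    atomicMax(out_max, in[i]);
  }
}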

__device__ float atomicMax ( float *  address,
float  val 
)

Definition at line 388 of file cuda_mapd_rt.cu.

388  {
389  int* address_as_int = (int*)address;
390  int old = *address_as_int, assumed;
391 
392  do {
393  assumed = old;
394  old = atomicCAS(
395  address_as_int, assumed, __float_as_int(max(val, __int_as_float(assumed))));
396 
397  // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN)
398  } while (assumed != old);
399 
400  return __int_as_float(old);
401 }
__device__ void atomicMax16 ( int16_t *  agg,
const int16_t  val 
)

Definition at line 500 of file cuda_mapd_rt.cu.

Referenced by agg_max_int16_shared().

500  {
501  // properly align the input pointer:
502  unsigned int* base_address_u32 =
503  reinterpret_cast<unsigned int*>(reinterpret_cast<size_t>(agg) & ~0x3);
504 
505  unsigned int old_value = *base_address_u32;
506  unsigned int swap_value, compare_value;
507  do {
508  compare_value = old_value;
509  swap_value =
510  (reinterpret_cast<size_t>(agg) & 0x2)
511  ? static_cast<unsigned int>(max(static_cast<int16_t>(old_value >> 16), val))
512  << 16 |
513  (old_value & 0xFFFF)
514  : (old_value & 0xFFFF0000) |
515  static_cast<unsigned int>(
516  max(static_cast<int16_t>(old_value & 0xFFFF), val));
517  old_value = atomicCAS(base_address_u32, compare_value, swap_value);
518  } while (old_value != compare_value);
519 }

+ Here is the caller graph for this function:
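
The halfword update above can be modeled on the host as follows (sketch, illustration only): the containing 32-bit word is rebuilt with the selected 16-bit lane replaced by the maximum.

#include <cstdint>

// Host-side model of the swap_value computation in atomicMax16.
uint32_t pack_max_i16(uint32_t word, bool high_half, int16_t val) {
  const int16_t cur = high_half ? static_cast<int16_t>(word >> 16)
                                : static_cast<int16_t>(word & 0xFFFF);
  const uint16_t nv = static_cast<uint16_t>(val > cur ? val : cur);
  return high_half ? (static_cast<uint32_t>(nv) << 16) | (word & 0xFFFF)
                   : (word & 0xFFFF0000u) | nv;
}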

__device__ int64_t atomicMax64 ( int64_t *  address,
int64_t  val 
)

Definition at line 330 of file cuda_mapd_rt.cu.

Referenced by agg_max_shared().

330  {
331  unsigned long long int* address_as_ull = (unsigned long long int*)address;
332  unsigned long long int old = *address_as_ull, assumed;
333 
334  do {
335  assumed = old;
336  old = atomicCAS(address_as_ull, assumed, max((long long)val, (long long)assumed));
337  } while (assumed != old);
338 
339  return old;
340 }

+ Here is the caller graph for this function:

__device__ int64_t atomicMax64SkipVal ( int64_t *  address,
int64_t  val,
const int64_t  skip_val 
)

Definition at line 963 of file cuda_mapd_rt.cu.

Referenced by agg_max_skip_val_shared().

965  {
966  unsigned long long int* address_as_ull =
967  reinterpret_cast<unsigned long long int*>(address);
968  unsigned long long int old = *address_as_ull, assumed;
969 
970  do {
971  assumed = old;
972  old = atomicCAS(address_as_ull,
973  assumed,
974  assumed == skip_val ? val : max((long long)val, (long long)assumed));
975  } while (assumed != old);
976 
977  return old;
978 }

+ Here is the caller graph for this function:

__device__ void atomicMax8 ( int8_t *  agg,
const int8_t  val 
)

Definition at line 522 of file cuda_mapd_rt.cu.

Referenced by agg_max_int8_shared().

522  {
523  // properly align the input pointer:
524  unsigned int* base_address_u32 =
525  reinterpret_cast<unsigned int*>(reinterpret_cast<size_t>(agg) & ~0x3);
526 
527  // __byte_perm(unsigned int A, unsigned int B, unsigned int s):
528  // if s == 0x3214 returns {A[31..24], A[23..16], A[15..8], B[7..0]}
529  // if s == 0x3240 returns {A[31..24], A[23..16], B[7...0], A[7..0]}
530  // if s == 0x3410 returns {A[31..24], B[7....0], A[15..8], A[7..0]}
531  // if s == 0x4210 returns {B[7....0], A[23..16], A[15..8], A[7..0]}
532  constexpr unsigned int byte_permutations[] = {0x3214, 0x3240, 0x3410, 0x4210};
533  unsigned int old_value = *base_address_u32;
534  unsigned int swap_value, compare_value;
535  do {
536  compare_value = old_value;
537  auto max_value = static_cast<unsigned int>(
538  // compare val with its corresponding bits in the compare_value
539  max(val,
540  static_cast<int8_t>(__byte_perm(
541  compare_value, 0, (reinterpret_cast<size_t>(agg) & 0x3) | 0x4440))));
542  swap_value = __byte_perm(
543  compare_value, max_value, byte_permutations[reinterpret_cast<size_t>(agg) & 0x3]);
544  old_value = atomicCAS(base_address_u32, compare_value, swap_value);
545  } while (compare_value != old_value);
546 }

+ Here is the caller graph for this function:
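
Analogously, the byte-lane update can be modeled on the host as follows (sketch, illustration only; the device code selects the lane with __byte_perm instead of shifts and masks):

#include <cstdint>

// Host-side model of replacing byte `lane` (0..3) of a 32-bit word with
// max(current byte, val), mirroring the swap_value built in atomicMax8.
uint32_t pack_max_i8(uint32_t word, unsigned lane, int8_t val) {
  const int8_t cur = static_cast<int8_t>(word >> (lane * 8));
  const uint8_t nv = static_cast<uint8_t>(val > cur ? val : cur);
  const uint32_t mask = 0xFFu << (lane * 8);
  return (word & ~mask) | (static_cast<uint32_t>(nv) << (lane * 8));
}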

__device__ double atomicMin ( double *  address,
double  val 
)

Definition at line 403 of file cuda_mapd_rt.cu.

Referenced by agg_min_double_shared(), agg_min_float_shared(), agg_min_int32_shared(), atomicMin32SkipVal(), atomicMinFltSkipVal(), and compute_bucket_sizes_impl().

403  {
404  unsigned long long int* address_as_ull = (unsigned long long int*)address;
405  unsigned long long int old = *address_as_ull, assumed;
406 
407  do {
408  assumed = old;
409  old = atomicCAS(address_as_ull,
410  assumed,
411  __double_as_longlong(min(val, __longlong_as_double(assumed))));
412  } while (assumed != old);
413 
414  return __longlong_as_double(old);
415 }

+ Here is the caller graph for this function:

__device__ double atomicMin ( float *  address,
float  val 
)

Definition at line 417 of file cuda_mapd_rt.cu.

417  {
418  int* address_as_ull = (int*)address;
419  int old = *address_as_ull, assumed;
420 
421  do {
422  assumed = old;
423  old = atomicCAS(
424  address_as_ull, assumed, __float_as_int(min(val, __int_as_float(assumed))));
425  } while (assumed != old);
426 
427  return __int_as_float(old);
428 }
__device__ void atomicMin16 ( int16_t *  agg,
const int16_t  val 
)

Definition at line 562 of file cuda_mapd_rt.cu.

Referenced by agg_min_int16_shared().

562  {
563  // properly align the input pointer:
564  unsigned int* base_address_u32 =
565  reinterpret_cast<unsigned int*>(reinterpret_cast<size_t>(agg) & ~0x3);
566 
567  unsigned int old_value = *base_address_u32;
568  unsigned int swap_value, compare_value;
569  do {
570  compare_value = old_value;
571  swap_value =
572  (reinterpret_cast<size_t>(agg) & 0x2)
573  ? static_cast<unsigned int>(min(static_cast<int16_t>(old_value >> 16), val))
574  << 16 |
575  (old_value & 0xFFFF)
576  : (old_value & 0xFFFF0000) |
577  static_cast<unsigned int>(
578  min(static_cast<int16_t>(old_value & 0xFFFF), val));
579  old_value = atomicCAS(base_address_u32, compare_value, swap_value);
580  } while (old_value != compare_value);
581 }

+ Here is the caller graph for this function:

__device__ void atomicMin16SkipVal ( int16_t *  agg,
const int16_t  val,
const int16_t  skip_val 
)

Definition at line 584 of file cuda_mapd_rt.cu.

Referenced by agg_min_int16_skip_val_shared().

586  {
587  // properly align the input pointer:
588  unsigned int* base_address_u32 =
589  reinterpret_cast<unsigned int*>(reinterpret_cast<size_t>(agg) & ~0x3);
590 
591  unsigned int old_value = *base_address_u32;
592  unsigned int swap_value, compare_value;
593  do {
594  compare_value = old_value;
595  int16_t selected_old_val = (reinterpret_cast<size_t>(agg) & 0x2)
596  ? static_cast<int16_t>(old_value >> 16)
597  : static_cast<int16_t>(old_value & 0xFFFF);
598 
599  swap_value =
600  (reinterpret_cast<size_t>(agg) & 0x2)
601  ? static_cast<unsigned int>(
602  selected_old_val == skip_val ? val : min(selected_old_val, val))
603  << 16 |
604  (old_value & 0xFFFF)
605  : (old_value & 0xFFFF0000) |
606  static_cast<unsigned int>(
607  selected_old_val == skip_val ? val : min(selected_old_val, val));
608  old_value = atomicCAS(base_address_u32, compare_value, swap_value);
609  } while (old_value != compare_value);
610 }

+ Here is the caller graph for this function:

__device__ int32_t atomicMin32SkipVal ( int32_t *  address,
int32_t  val,
const int32_t  skip_val 
)

Definition at line 886 of file cuda_mapd_rt.cu.

References atomicMin().

Referenced by agg_min_int32_skip_val_shared().

888  {
889  int32_t old = atomicExch(address, INT_MAX);
890  return atomicMin(address, old == skip_val ? val : min(old, val));
891 }
__device__ double atomicMin(double *address, double val)

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

__device__ int64_t atomicMin64 ( int64_t *  address,
int64_t  val 
)

Definition at line 342 of file cuda_mapd_rt.cu.

Referenced by agg_min_shared().

342  {
343  unsigned long long int* address_as_ull = (unsigned long long int*)address;
344  unsigned long long int old = *address_as_ull, assumed;
345 
346  do {
347  assumed = old;
348  old = atomicCAS(address_as_ull, assumed, min((long long)val, (long long)assumed));
349  } while (assumed != old);
350 
351  return old;
352 }

+ Here is the caller graph for this function:

__device__ int64_t atomicMin64SkipVal ( int64_t *  address,
int64_t  val,
const int64_t  skip_val 
)

Definition at line 938 of file cuda_mapd_rt.cu.

Referenced by agg_min_skip_val_shared().

940  {
941  unsigned long long int* address_as_ull =
942  reinterpret_cast<unsigned long long int*>(address);
943  unsigned long long int old = *address_as_ull, assumed;
944 
945  do {
946  assumed = old;
947  old = atomicCAS(address_as_ull,
948  assumed,
949  assumed == skip_val ? val : min((long long)val, (long long)assumed));
950  } while (assumed != old);
951 
952  return old;
953 }

+ Here is the caller graph for this function:

__device__ void atomicMin8 ( int8_t *  agg,
const int8_t  val 
)

Definition at line 612 of file cuda_mapd_rt.cu.

Referenced by agg_min_int8_shared().

612  {
613  // properly align the input pointer:
614  unsigned int* base_address_u32 =
615  reinterpret_cast<unsigned int*>(reinterpret_cast<size_t>(agg) & ~0x3);
616 
617  constexpr unsigned int byte_permutations[] = {0x3214, 0x3240, 0x3410, 0x4210};
618  unsigned int old_value = *base_address_u32;
619  unsigned int swap_value, compare_value;
620  do {
621  compare_value = old_value;
622  auto min_value = static_cast<unsigned int>(
623  min(val,
624  static_cast<int8_t>(__byte_perm(
625  compare_value, 0, (reinterpret_cast<size_t>(agg) & 0x3) | 0x4440))));
626  swap_value = __byte_perm(
627  compare_value, min_value, byte_permutations[reinterpret_cast<size_t>(agg) & 0x3]);
628  old_value = atomicCAS(base_address_u32, compare_value, swap_value);
629  } while (compare_value != old_value);
630 }

+ Here is the caller graph for this function:

__device__ void atomicMin8SkipVal ( int8_t *  agg,
const int8_t  val,
const int8_t  skip_val 
)

Definition at line 632 of file cuda_mapd_rt.cu.

Referenced by agg_min_int8_skip_val_shared().

632  {
633  // properly align the input pointer:
634  unsigned int* base_address_u32 =
635  reinterpret_cast<unsigned int*>(reinterpret_cast<size_t>(agg) & ~0x3);
636 
637  constexpr unsigned int byte_permutations[] = {0x3214, 0x3240, 0x3410, 0x4210};
638  unsigned int old_value = *base_address_u32;
639  unsigned int swap_value, compare_value;
640  do {
641  compare_value = old_value;
642  int8_t selected_old_val = static_cast<int8_t>(
643  __byte_perm(compare_value, 0, (reinterpret_cast<size_t>(agg) & 0x3) | 0x4440));
644  auto min_value = static_cast<unsigned int>(
645  selected_old_val == skip_val ? val : min(val, selected_old_val));
646  swap_value = __byte_perm(
647  compare_value, min_value, byte_permutations[reinterpret_cast<size_t>(agg) & 0x3]);
648  old_value = atomicCAS(base_address_u32, compare_value, swap_value);
649  } while (compare_value != old_value);
650 }

+ Here is the caller graph for this function:

__device__ double atomicMinDblSkipVal ( double *  address,
double  val,
const double  skip_val 
)

Definition at line 1069 of file cuda_mapd_rt.cu.

Referenced by agg_min_double_skip_val_shared().

1071  {
1072  unsigned long long int* address_as_ull =
1073  reinterpret_cast<unsigned long long int*>(address);
1074  unsigned long long int old = *address_as_ull;
1075  unsigned long long int skip_val_as_ull =
1076  *reinterpret_cast<const unsigned long long*>(&skip_val);
1077  unsigned long long int assumed;
1078 
1079  do {
1080  assumed = old;
1081  old = atomicCAS(address_as_ull,
1082  assumed,
1083  assumed == skip_val_as_ull
1084  ? *reinterpret_cast<unsigned long long*>(&val)
1085  : __double_as_longlong(min(val, __longlong_as_double(assumed))));
1086  } while (assumed != old);
1087 
1088  return __longlong_as_double(old);
1089 }

+ Here is the caller graph for this function:

__device__ float atomicMinFltSkipVal ( int32_t *  address,
float  val,
const float  skip_val 
)

Definition at line 1021 of file cuda_mapd_rt.cu.

References atomicMin().

Referenced by agg_min_float_skip_val_shared().

1021  {
1022  float old = atomicExch(reinterpret_cast<float*>(address), FLT_MAX);
1023  return atomicMin(
1024  reinterpret_cast<float*>(address),
1025  __float_as_int(old) == __float_as_int(skip_val) ? val : fminf(old, val));
1026 }
__device__ double atomicMin(double *address, double val)

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

__device__ int32_t atomicSum32SkipVal ( int32_t *  address,
const int32_t  val,
const int32_t  skip_val 
)

Definition at line 901 of file cuda_mapd_rt.cu.

Referenced by agg_sum_int32_skip_val_shared().

903  {
904  unsigned int* address_as_int = (unsigned int*)address;
905  int32_t old = atomicExch(address_as_int, 0);
906  int32_t old2 = atomicAdd(address_as_int, old == skip_val ? val : (val + old));
907  return old == skip_val ? old2 : (old2 + old);
908 }

+ Here is the caller graph for this function:

__device__ int64_t atomicSum64SkipVal ( int64_t *  address,
const int64_t  val,
const int64_t  skip_val 
)

Definition at line 920 of file cuda_mapd_rt.cu.

Referenced by agg_sum_skip_val_shared().

922  {
923  unsigned long long int* address_as_ull = (unsigned long long int*)address;
924  int64_t old = atomicExch(address_as_ull, 0);
925  int64_t old2 = atomicAdd(address_as_ull, old == skip_val ? val : (val + old));
926  return old == skip_val ? old2 : (old2 + old);
927 }

+ Here is the caller graph for this function:

__device__ void atomicSumDblSkipVal ( double *  address,
const double  val,
const double  skip_val 
)

Definition at line 1051 of file cuda_mapd_rt.cu.

Referenced by agg_sum_double_skip_val_shared().

1053  {
1054  unsigned long long int* address_as_ull = (unsigned long long int*)address;
1055  double old = __longlong_as_double(atomicExch(address_as_ull, __double_as_longlong(0.)));
1056  atomicAdd(
1057  address,
1058  __double_as_longlong(old) == __double_as_longlong(skip_val) ? val : (val + old));
1059 }

+ Here is the caller graph for this function:

__device__ void atomicSumFltSkipVal ( float *  address,
const float  val,
const float  skip_val 
)

Definition at line 1036 of file cuda_mapd_rt.cu.

Referenced by agg_sum_float_skip_val_shared().

1038  {
1039  float old = atomicExch(address, 0.f);
1040  atomicAdd(address, __float_as_int(old) == __float_as_int(skip_val) ? val : (val + old));
1041 }

+ Here is the caller graph for this function:

__device__ bool check_interrupt ( )

Definition at line 159 of file cuda_mapd_rt.cu.

Referenced by check_interrupt_rt(), and ColumnFetcher::linearizeFixedLenArrayColFrags().

159  {
160  return (runtime_interrupt_flag == 1) ? true : false;
161 }
__device__ int32_t runtime_interrupt_flag
Definition: cuda_mapd_rt.cu:95

+ Here is the caller graph for this function:
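
One plausible way for the host to raise the flag is shown below (assumption for orientation only; the executor's actual interrupt mechanism may differ):

// Hypothetical host-side sketch: set the device-side flag that check_interrupt()
// polls, so running kernels can bail out cooperatively.
int32_t one = 1;
cudaMemcpyToSymbol(runtime_interrupt_flag, &one, sizeof(one));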

__device__ int32_t checked_single_agg_id_double_shared ( int64_t *  agg,
const double  val,
const double  null_val 
)

Definition at line 733 of file cuda_mapd_rt.cu.

735  {
736  unsigned long long int* address_as_ull = reinterpret_cast<unsigned long long int*>(agg);
737  unsigned long long int old = *address_as_ull, assumed;
738 
739  if (val == null_val) {
740  return 0;
741  }
742 
743  do {
744  if (static_cast<int64_t>(old) != __double_as_longlong(null_val)) {
745  if (static_cast<int64_t>(old) != __double_as_longlong(val)) {
746  // see Execute::ERR_SINGLE_VALUE_FOUND_MULTIPLE_VALUES
747  return 15;
748  } else {
749  break;
750  }
751  }
752 
753  assumed = old;
754  old = atomicCAS(address_as_ull, assumed, __double_as_longlong(val));
755  } while (assumed != old);
756 
757  return 0;
758 }
__device__ int32_t checked_single_agg_id_double_shared_slow ( int64_t *  agg,
const double *  valp,
const double  null_val 
)

Definition at line 765 of file cuda_mapd_rt.cu.

767  {
768  unsigned long long int* address_as_ull = reinterpret_cast<unsigned long long int*>(agg);
769  unsigned long long int old = *address_as_ull, assumed;
770  double val = *valp;
771 
772  if (val == null_val) {
773  return 0;
774  }
775 
776  do {
777  if (static_cast<int64_t>(old) != __double_as_longlong(null_val)) {
778  if (static_cast<int64_t>(old) != __double_as_longlong(val)) {
779  // see Execute::ERR_SINGLE_VALUE_FOUND_MULTIPLE_VALUES
780  return 15;
781  } else {
782  break;
783  }
784  }
785 
786  assumed = old;
787  old = atomicCAS(address_as_ull, assumed, __double_as_longlong(val));
788  } while (assumed != old);
789 
790  return 0;
791 }
__device__ int32_t checked_single_agg_id_float_shared ( int32_t *  agg,
const float  val,
const float  null_val 
)

Definition at line 797 of file cuda_mapd_rt.cu.

799  {
800  int* address_as_ull = reinterpret_cast<int*>(agg);
801  int old = *address_as_ull, assumed;
802 
803  if (val == null_val) {
804  return 0;
805  }
806 
807  do {
808  if (old != __float_as_int(null_val)) {
809  if (old != __float_as_int(val)) {
810  // see Execute::ERR_SINGLE_VALUE_FOUND_MULTIPLE_VALUES
811  return 15;
812  } else {
813  break;
814  }
815  }
816 
817  assumed = old;
818  old = atomicCAS(address_as_ull, assumed, __float_as_int(val));
819  } while (assumed != old);
820 
821  return 0;
822 }
__device__ int32_t checked_single_agg_id_shared ( int64_t *  agg,
const int64_t  val,
const int64_t  null_val 
)

Definition at line 690 of file cuda_mapd_rt.cu.

692  {
693  unsigned long long int* address_as_ull = reinterpret_cast<unsigned long long int*>(agg);
694  unsigned long long int old = *address_as_ull, assumed;
695 
696  if (val == null_val) {
697  return 0;
698  }
699 
700  do {
701  if (static_cast<int64_t>(old) != null_val) {
702  if (static_cast<int64_t>(old) != val) {
703  // see Execute::ERR_SINGLE_VALUE_FOUND_MULTIPLE_VALUES
704  return 15;
705  } else {
706  break;
707  }
708  }
709 
710  assumed = old;
711  old = atomicCAS(address_as_ull, assumed, val);
712  } while (assumed != old);
713 
714  return 0;
715 }
__device__ int64_t* declare_dynamic_shared_memory ( )

Definition at line 56 of file cuda_mapd_rt.cu.

56  {
57  extern __shared__ int64_t shared_mem_buffer[];
58  return shared_mem_buffer;
59 }
__device__ bool dynamic_watchdog ( )

Definition at line 115 of file cuda_mapd_rt.cu.

Referenced by anonymous_namespace{ResultSetReduction.cpp}::check_watchdog(), check_watchdog_rt(), anonymous_namespace{ResultSetReduction.cpp}::check_watchdog_with_seed(), get_group_value_columnar_slot_with_watchdog(), get_group_value_columnar_with_watchdog(), and get_group_value_with_watchdog().

115  {
116  // check for dynamic watchdog, if triggered all threads return true
117  if (dw_cycle_budget == 0LL) {
118  return false; // Uninitialized watchdog can't check time
119  }
120  if (dw_abort == 1) {
121  return true; // Received host request to abort
122  }
123  uint32_t smid = get_smid();
124  if (smid >= 128) {
125  return false;
126  }
127  __shared__ volatile int64_t dw_block_cycle_start; // Thread block shared cycle start
128  __shared__ volatile bool
129  dw_should_terminate; // all threads within a block should return together if
130  // watchdog criteria is met
131 
132  // thread 0 either initializes or reads the initial clock cycle; the result is stored
133  // into shared memory. Since all threads within a block share the same SM, there's no
134  // point in using more threads here.
135  if (threadIdx.x == 0) {
136  dw_block_cycle_start = 0LL;
137  int64_t cycle_count = static_cast<int64_t>(clock64());
138  // Make sure the block hasn't switched SMs
139  if (smid == get_smid()) {
140  dw_block_cycle_start = static_cast<int64_t>(
141  atomicCAS(reinterpret_cast<unsigned long long*>(&dw_sm_cycle_start[smid]),
142  0ULL,
143  static_cast<unsigned long long>(cycle_count)));
144  }
145 
146  int64_t cycles = cycle_count - dw_block_cycle_start;
147  if ((smid == get_smid()) && (dw_block_cycle_start > 0LL) &&
148  (cycles > dw_cycle_budget)) {
149  // Check if we're out of time on this particular SM
150  dw_should_terminate = true;
151  } else {
152  dw_should_terminate = false;
153  }
154  }
155  __syncthreads();
156  return dw_should_terminate;
157 }
__device__ int64_t dw_sm_cycle_start[128]
Definition: cuda_mapd_rt.cu:91
__device__ int64_t dw_cycle_budget
Definition: cuda_mapd_rt.cu:93
__inline__ __device__ uint32_t get_smid(void)
Definition: cuda_mapd_rt.cu:97
__device__ int32_t dw_abort
Definition: cuda_mapd_rt.cu:94

+ Here is the caller graph for this function:
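
A sketch of how generated kernel code might consult the watchdog inside its grid-stride loop (illustrative; the kernel name and per-row work are assumptions):

__global__ void scan_kernel(const int64_t* col, const int64_t row_count, int64_t* out) {
  const int32_t start = pos_start_impl(NULL);
  const int32_t step = pos_step_impl();
  for (int64_t row = start; row < row_count; row += step) {
    if (dynamic_watchdog()) {
      return;  // cycle budget exhausted or dw_abort set by the host; the block exits together
    }
    // ... per-row work ...
  }
}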

__device__ void force_sync ( )

Definition at line 1285 of file cuda_mapd_rt.cu.

1285  {
1286  __threadfence_block();
1287 }
__device__ int64_t get_block_index ( )

Definition at line 23 of file cuda_mapd_rt.cu.

23  {
24  return blockIdx.x;
25 }
template<typename T = unsigned long long>
__device__ T get_empty_key ( )
inline

Definition at line 164 of file cuda_mapd_rt.cu.

References EMPTY_KEY_64.

164  {
165  return EMPTY_KEY_64;
166 }
#define EMPTY_KEY_64
template<>
__device__ unsigned int get_empty_key ( )
inline

Definition at line 169 of file cuda_mapd_rt.cu.

References EMPTY_KEY_32.

169  {
170  return EMPTY_KEY_32;
171 }
#define EMPTY_KEY_32
template<typename T >
__device__ int64_t* get_matching_group_value ( int64_t *  groups_buffer,
const uint32_t  h,
const T *  key,
const uint32_t  key_count,
const uint32_t  row_size_quad 
)
inline

Definition at line 174 of file cuda_mapd_rt.cu.

References align_to_int64().

Referenced by get_group_value(), get_group_value_with_watchdog(), and get_matching_group_value().

178  {
179  const T empty_key = get_empty_key<T>();
180  uint32_t off = h * row_size_quad;
181  auto row_ptr = reinterpret_cast<T*>(groups_buffer + off);
182  {
183  const T old = atomicCAS(row_ptr, empty_key, *key);
184  if (empty_key == old && key_count > 1) {
185  for (size_t i = 1; i <= key_count - 1; ++i) {
186  atomicExch(row_ptr + i, key[i]);
187  }
188  }
189  }
190  if (key_count > 1) {
191  while (atomicAdd(row_ptr + key_count - 1, 0) == empty_key) {
192  // spin until the winning thread has finished writing the entire key and the init
193  // value
194  }
195  }
196  bool match = true;
197  for (uint32_t i = 0; i < key_count; ++i) {
198  if (row_ptr[i] != key[i]) {
199  match = false;
200  break;
201  }
202  }
203 
204  if (match) {
205  auto row_ptr_i8 = reinterpret_cast<int8_t*>(row_ptr + key_count);
206  return reinterpret_cast<int64_t*>(align_to_int64(row_ptr_i8));
207  }
208  return NULL;
209 }
FORCE_INLINE HOST DEVICE T align_to_int64(T addr)

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

__device__ int64_t* get_matching_group_value ( int64_t *  groups_buffer,
const uint32_t  h,
const int64_t *  key,
const uint32_t  key_count,
const uint32_t  key_width,
const uint32_t  row_size_quad 
)

Definition at line 211 of file cuda_mapd_rt.cu.

References get_matching_group_value().

216  {
217  switch (key_width) {
218  case 4:
219  return get_matching_group_value(groups_buffer,
220  h,
221  reinterpret_cast<const unsigned int*>(key),
222  key_count,
223  row_size_quad);
224  case 8:
225  return get_matching_group_value(groups_buffer,
226  h,
227  reinterpret_cast<const unsigned long long*>(key),
228  key_count,
229  row_size_quad);
230  default:
231  return NULL;
232  }
233 }
__device__ int64_t * get_matching_group_value(int64_t *groups_buffer, const uint32_t h, const T *key, const uint32_t key_count, const uint32_t row_size_quad)

+ Here is the call graph for this function:
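
A simplified caller-side probe loop is sketched below (the real probing strategy lives in GroupByRuntime.cpp and may differ; linear probing, the initial hash, and the 8-byte key width are assumptions):

// Illustrative sketch only: keep probing buckets until get_matching_group_value()
// either claims an empty slot for this key or finds an existing match.
__device__ int64_t* find_group_slot(int64_t* groups_buffer,
                                    const int64_t* key,
                                    const uint32_t key_count,
                                    const uint32_t entry_count,
                                    const uint32_t row_size_quad) {
  const uint32_t h = static_cast<uint32_t>(key[0]) % entry_count;  // assumed hash
  for (uint32_t i = 0; i < entry_count; ++i) {
    const uint32_t bucket = (h + i) % entry_count;
    int64_t* slot = get_matching_group_value(
        groups_buffer, bucket, key, key_count, /*key_width=*/8, row_size_quad);
    if (slot) {
      return slot;  // NULL means the bucket holds a different key; keep probing
    }
  }
  return NULL;  // table full
}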

__device__ int64_t* get_matching_group_value_columnar ( int64_t *  groups_buffer,
const uint32_t  h,
const int64_t *  key,
const uint32_t  key_qw_count,
const size_t  entry_count 
)

Definition at line 296 of file cuda_mapd_rt.cu.

References EMPTY_KEY_64.

Referenced by get_group_value_columnar(), and get_group_value_columnar_with_watchdog().

301  {
302  uint32_t off = h;
303  {
304  const uint64_t old = atomicCAS(
305  reinterpret_cast<unsigned long long*>(groups_buffer + off), EMPTY_KEY_64, *key);
306  if (EMPTY_KEY_64 == old) {
307  for (size_t i = 0; i < key_qw_count; ++i) {
308  groups_buffer[off] = key[i];
309  off += entry_count;
310  }
311  return &groups_buffer[off];
312  }
313  }
314  __syncthreads();
315  off = h;
316  for (size_t i = 0; i < key_qw_count; ++i) {
317  if (groups_buffer[off] != key[i]) {
318  return NULL;
319  }
320  off += entry_count;
321  }
322  return &groups_buffer[off];
323 }
#define EMPTY_KEY_64

+ Here is the caller graph for this function:

template<typename T >
__device__ int32_t get_matching_group_value_columnar_slot ( int64_t *  groups_buffer,
const uint32_t  entry_count,
const uint32_t  h,
const T *  key,
const uint32_t  key_count 
)

Definition at line 236 of file cuda_mapd_rt.cu.

Referenced by get_group_value_columnar_slot(), get_group_value_columnar_slot_with_watchdog(), and get_matching_group_value_columnar_slot().

240  {
241  const T empty_key = get_empty_key<T>();
242  const uint64_t old =
243  atomicCAS(reinterpret_cast<T*>(groups_buffer + h), empty_key, *key);
244  // the winner thread proceeds with writing the rest of the keys
245  if (old == empty_key) {
246  uint32_t offset = h + entry_count;
247  for (size_t i = 1; i < key_count; ++i) {
248  *reinterpret_cast<T*>(groups_buffer + offset) = key[i];
249  offset += entry_count;
250  }
251  }
252 
253  __threadfence();
254  // for all threads except the winning thread, the memory contents of the keys
255  // related to the hash offset are checked again. In case of a complete match
256  // the hash offset is returned; otherwise -1 is returned
257  if (old != empty_key) {
258  uint32_t offset = h;
259  for (uint32_t i = 0; i < key_count; ++i) {
260  if (*reinterpret_cast<T*>(groups_buffer + offset) != key[i]) {
261  return -1;
262  }
263  offset += entry_count;
264  }
265  }
266  return h;
267 }

+ Here is the caller graph for this function:

__device__ int32_t get_matching_group_value_columnar_slot ( int64_t *  groups_buffer,
const uint32_t  entry_count,
const uint32_t  h,
const int64_t *  key,
const uint32_t  key_count,
const uint32_t  key_width 
)

Definition at line 270 of file cuda_mapd_rt.cu.

References get_matching_group_value_columnar_slot().

275  {
276  switch (key_width) {
277  case 4:
278  return get_matching_group_value_columnar_slot(
279  groups_buffer,
280  entry_count,
281  h,
282  reinterpret_cast<const unsigned int*>(key),
283  key_count);
284  case 8:
285  return get_matching_group_value_columnar_slot(
286  groups_buffer,
287  entry_count,
288  h,
289  reinterpret_cast<const unsigned long long*>(key),
290  key_count);
291  default:
292  return -1;
293  }
294 }
__device__ int32_t get_matching_group_value_columnar_slot(int64_t *groups_buffer, const uint32_t entry_count, const uint32_t h, const T *key, const uint32_t key_count)

__inline__ __device__ uint32_t get_smid ( void  )

Definition at line 97 of file cuda_mapd_rt.cu.

Referenced by dynamic_watchdog().

97  {
98  uint32_t ret;
99  asm("mov.u32 %0, %%smid;" : "=r"(ret));
100  return ret;
101 }

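The inline assembly reads the %smid special register, i.e. the index of the streaming multiprocessor on which the calling block is currently resident; dynamic_watchdog() uses it to index the per-SM dw_sm_cycle_start array. A standalone toy program, for illustration only, showing what the register yields:

#include <cstdint>
#include <cstdio>

// Same PTX read as get_smid(), duplicated here so the example compiles alone.
__inline__ __device__ uint32_t my_smid() {
  uint32_t ret;
  asm("mov.u32 %0, %%smid;" : "=r"(ret));
  return ret;
}

__global__ void report_smid() {
  if (threadIdx.x == 0) {
    printf("block %d runs on SM %u\n", blockIdx.x, my_smid());
  }
}

int main() {
  report_smid<<<8, 32>>>();
  cudaDeviceSynchronize();
  return 0;
}
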
__device__ int64_t get_thread_index ( )

Definition at line 19 of file cuda_mapd_rt.cu.

19  {
20  return threadIdx.x;
21 }
__device__ int32_t group_buff_idx_impl ( )

Definition at line 31 of file cuda_mapd_rt.cu.

References pos_start_impl().

31  {
32  return pos_start_impl(NULL);
33 }
__device__ int32_t pos_start_impl(const int32_t *row_index_resume)
Definition: cuda_mapd_rt.cu:27

__device__ const int64_t* init_shared_mem ( const int64_t *  global_groups_buffer,
const int32_t  groups_buffer_size 
)

Initializes the shared memory buffer for perfect hash group by. In this function, we simply copy the global group by buffer (already initialized on the host and transferred) to all shared memory group by buffers.

Definition at line 66 of file cuda_mapd_rt.cu.

67  {
68  // dynamic shared memory declaration
69  extern __shared__ int64_t shared_groups_buffer[];
70 
71  // it is assumed that buffer size is aligned with 64-bit units
72  // so it is safe to assign 64-bit to each thread
73  const int32_t buffer_units = groups_buffer_size >> 3;
74 
75  for (int32_t pos = threadIdx.x; pos < buffer_units; pos += blockDim.x) {
76  shared_groups_buffer[pos] = global_groups_buffer[pos];
77  }
78  __syncthreads();
79  return shared_groups_buffer;
80 }
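
Because the buffer is declared extern __shared__, its byte size has to be passed as the third kernel launch parameter, and it must be a multiple of 8 for the 64-bit copy above to cover it completely. A toy kernel and launch, for illustration only; the real launch site lives in the query execution layer, not in this file.

#include <cstdint>

// Toy kernel: copies a global group-by buffer into dynamic shared memory the
// same way init_shared_mem() does, then (not shown) aggregates into it.
__global__ void toy_group_by(const int64_t* global_groups_buffer,
                             const int32_t groups_buffer_size /* bytes */) {
  extern __shared__ int64_t shared_groups_buffer[];
  const int32_t buffer_units = groups_buffer_size >> 3;  // 8 bytes per unit
  for (int32_t pos = threadIdx.x; pos < buffer_units; pos += blockDim.x) {
    shared_groups_buffer[pos] = global_groups_buffer[pos];
  }
  __syncthreads();
  // ... update shared_groups_buffer, then reduce it back to global memory ...
}

// The dynamic shared memory size is the third launch parameter:
//   toy_group_by<<<grid_dim, block_dim, groups_buffer_size>>>(buf, groups_buffer_size);
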
__device__ const int64_t* init_shared_mem_nop ( const int64_t *  groups_buffer,
const int32_t  groups_buffer_size 
)

Definition at line 43 of file cuda_mapd_rt.cu.

45  {
46  return groups_buffer;
47 }
__device__ void linear_probabilistic_count ( uint8_t *  bitmap,
const uint32_t  bitmap_bytes,
const uint8_t *  key_bytes,
const uint32_t  key_len 
)

Definition at line 1214 of file cuda_mapd_rt.cu.

1217  {
1218  const uint32_t bit_pos = MurmurHash3(key_bytes, key_len, 0) % (bitmap_bytes * 8);
1219  const uint32_t word_idx = bit_pos / 32;
1220  const uint32_t bit_idx = bit_pos % 32;
1221  atomicOr(((uint32_t*)bitmap) + word_idx, 1 << bit_idx);
1222 }
RUNTIME_EXPORT NEVER_INLINE DEVICE uint32_t MurmurHash3(const void *key, int len, const uint32_t seed)
Definition: MurmurHash.cpp:33
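
Every input key sets at most one bit of the bitmap, so an approximate distinct count can later be recovered from the number of set bits (linear counting). A host-side sketch of the same bit addressing, with an arbitrary constant standing in for the MurmurHash3 value:

#include <cstdint>
#include <cstdio>

int main() {
  const uint32_t bitmap_bytes = 16;        // 128-bit bitmap
  const uint32_t toy_hash = 0x9e3779b9u;   // stand-in for MurmurHash3(key, len, 0)
  const uint32_t bit_pos = toy_hash % (bitmap_bytes * 8);
  const uint32_t word_idx = bit_pos / 32;  // which 32-bit word gets the atomicOr
  const uint32_t bit_idx = bit_pos % 32;   // which bit inside that word
  printf("set bit %u: word %u, mask 0x%08x\n", bit_pos, word_idx, 1u << bit_idx);
  return 0;
}
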
__device__ int32_t pos_start_impl ( const int32_t *  row_index_resume)

Definition at line 27 of file cuda_mapd_rt.cu.

Referenced by get_bin_from_k_heap_impl(), get_error_code(), group_buff_idx_impl(), and record_error_code().

27  {
28  return blockIdx.x * blockDim.x + threadIdx.x;
29 }

__device__ int32_t pos_step_impl ( )

Definition at line 35 of file cuda_mapd_rt.cu.

Referenced by get_bin_from_k_heap_impl().

35  {
36  return blockDim.x * gridDim.x;
37 }

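Together, pos_start_impl() and pos_step_impl() form the usual grid-stride pattern: a thread starts at its global thread id and advances by the total number of launched threads. A self-contained sketch of one way a kernel might iterate over rows with these two values; the process_rows kernel and its arguments are invented for illustration.

#include <cstdint>

__device__ int32_t pos_start() { return blockIdx.x * blockDim.x + threadIdx.x; }
__device__ int32_t pos_step() { return blockDim.x * gridDim.x; }

// Hypothetical kernel: thread t visits rows t, t + step, t + 2 * step, ...
__global__ void process_rows(const int64_t* in,
                             int64_t* out,
                             const int32_t row_count) {
  for (int32_t pos = pos_start(); pos < row_count; pos += pos_step()) {
    out[pos] = in[pos] + 1;  // stand-in for the per-row work
  }
}
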
__device__ bool slotEmptyKeyCAS ( int64_t *  slot,
int64_t  new_val,
int64_t  init_val 
)

Definition at line 1114 of file cuda_mapd_rt.cu.

1116  {
1117  auto slot_address = reinterpret_cast<unsigned long long int*>(slot);
1118  const auto empty_key =
1119  static_cast<unsigned long long int*>(static_cast<void*>(&init_val));
1120  const auto new_val_cast =
1121  static_cast<unsigned long long int*>(static_cast<void*>(&new_val));
1122 
1123  const auto old_val = atomicCAS(slot_address, *empty_key, *new_val_cast);
1124  if (old_val == *empty_key) {
1125  return true;
1126  } else {
1127  return false;
1128  }
1129 }
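
slotEmptyKeyCAS() returns true only for the single thread whose compare-and-swap actually finds init_val in the slot, so a caller can use it to elect exactly one writer per bucket. A hypothetical usage sketch; claim_bucket is not part of this file and assumes the declarations from cuda_mapd_rt.cu are in scope.

// Illustration only: elect one thread to initialize a bucket whose first
// 64-bit slot still holds the empty-key sentinel.
__device__ bool claim_bucket(int64_t* bucket_first_slot,
                             const int64_t first_key_component,
                             const int64_t empty_sentinel) {
  if (slotEmptyKeyCAS(bucket_first_slot, first_key_component, empty_sentinel)) {
    // this thread won the CAS and is responsible for writing the remaining
    // key components before the bucket is used
    return true;
  }
  return false;  // another thread owns (or already initialized) the bucket
}
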
__device__ bool slotEmptyKeyCAS_int16 ( int16_t *  slot,
int16_t  new_val,
int16_t  init_val 
)

Definition at line 1142 of file cuda_mapd_rt.cu.

1144  {
1145  unsigned int* base_slot_address =
1146  reinterpret_cast<unsigned int*>(reinterpret_cast<size_t>(slot) & ~0x3);
1147  unsigned int old_value = *base_slot_address;
1148  unsigned int swap_value, compare_value;
1149  do {
1150  compare_value = old_value;
1151  // exit criteria: if init_val does not exist in the slot (some other thread has
1152  // succeeded)
1153  if (static_cast<unsigned int>(init_val) !=
1154  __byte_perm(
1155  compare_value, 0, (reinterpret_cast<size_t>(slot) & 0x2 ? 0x3244 : 0x4410))) {
1156  return false;
1157  }
1158  swap_value = __byte_perm(compare_value,
1159  static_cast<unsigned int>(new_val),
1160  (reinterpret_cast<size_t>(slot) & 0x2) ? 0x5410 : 0x3254);
1161  old_value = atomicCAS(base_slot_address, compare_value, swap_value);
1162  } while (compare_value != old_value);
1163  return true;
1164 }
__device__ bool slotEmptyKeyCAS_int32 ( int32_t *  slot,
int32_t  new_val,
int32_t  init_val 
)

Definition at line 1131 of file cuda_mapd_rt.cu.

1133  {
1134  unsigned int* slot_address = reinterpret_cast<unsigned int*>(slot);
1135  unsigned int compare_value = static_cast<unsigned int>(init_val);
1136  unsigned int swap_value = static_cast<unsigned int>(new_val);
1137 
1138  const unsigned int old_value = atomicCAS(slot_address, compare_value, swap_value);
1139  return old_value == compare_value;
1140 }
__device__ bool slotEmptyKeyCAS_int8 ( int8_t *  slot,
int8_t  new_val,
int8_t  init_val 
)

Definition at line 1166 of file cuda_mapd_rt.cu.

1168  {
1169  // properly align the slot address:
1170  unsigned int* base_slot_address =
1171  reinterpret_cast<unsigned int*>(reinterpret_cast<size_t>(slot) & ~0x3);
1172  constexpr unsigned int byte_permutations[] = {0x3214, 0x3240, 0x3410, 0x4210};
1173  unsigned int old_value = *base_slot_address;
1174  unsigned int swap_value, compare_value;
1175  do {
1176  compare_value = old_value;
1177  // exit criteria: if init_val does not exist in the slot (some other thread has
1178  // succeeded)
1179  if (static_cast<unsigned int>(init_val) !=
1180  __byte_perm(compare_value, 0, (reinterpret_cast<size_t>(slot) & 0x3) | 0x4440)) {
1181  return false;
1182  }
1183  swap_value = __byte_perm(compare_value,
1184  static_cast<unsigned int>(new_val),
1185  byte_permutations[reinterpret_cast<size_t>(slot) & 0x3]);
1186  old_value = atomicCAS(base_slot_address, compare_value, swap_value);
1187  } while (compare_value != old_value);
1188  return true;
1189 }
__device__ uint64_t string_decode ( int8_t *  chunk_iter_,
int64_t  pos 
)

Definition at line 1203 of file cuda_mapd_rt.cu.

References ChunkIter_get_nth(), VarlenDatum::is_null, VarlenDatum::length, and VarlenDatum::pointer.

1203  {
1204  // TODO(alex): de-dup, the x64 version is basically identical
1205  ChunkIter* chunk_iter = reinterpret_cast<ChunkIter*>(chunk_iter_);
1206  VarlenDatum vd;
1207  bool is_end;
1208  ChunkIter_get_nth(chunk_iter, pos, false, &vd, &is_end);
1209  return vd.is_null ? 0
1210  : (reinterpret_cast<uint64_t>(vd.pointer) & 0xffffffffffff) |
1211  (static_cast<uint64_t>(vd.length) << 48);
1212 }
bool is_null
Definition: sqltypes.h:153
DEVICE void ChunkIter_get_nth(ChunkIter *it, int n, bool uncompress, VarlenDatum *result, bool *is_end)
Definition: ChunkIter.cpp:182
int8_t * pointer
Definition: sqltypes.h:152
size_t length
Definition: sqltypes.h:151

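The return value packs the varlen payload pointer into the low 48 bits and its length into the high 16 bits of a single uint64_t, so a consumer has to split those fields again. A tiny host-side sketch of that unpacking; the packed constant is made up for illustration and the pointer is never dereferenced.

#include <cstdint>
#include <cstdio>

int main() {
  // Pretend packed value: pointer 0x00007f1234560000, length 11.
  const uint64_t ptr_bits = 0x00007f1234560000ull;
  const uint64_t packed = (ptr_bits & 0xffffffffffffull) | (uint64_t{11} << 48);

  const char* str = reinterpret_cast<const char*>(packed & 0xffffffffffffull);
  const uint32_t len = static_cast<uint32_t>(packed >> 48);
  printf("decoded pointer %p, length %u\n", static_cast<const void*>(str), len);
  return 0;
}
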
__device__ void sync_threadblock ( )

Definition at line 1308 of file cuda_mapd_rt.cu.

Referenced by GpuSharedMemCodeBuilder::codegenInitialization(), and GpuSharedMemCodeBuilder::codegenReduction().

1308  {
1309  __syncthreads();
1310 }

__device__ void sync_warp ( )

Definition at line 1289 of file cuda_mapd_rt.cu.

1289  {
1290  __syncwarp();
1291 }
__device__ void sync_warp_protected ( int64_t  thread_pos,
int64_t  row_count 
)

Protected warp synchronization to make sure that either all or none of the threads within a warp go through the synchronization barrier. thread_pos: the current thread position to be used for a memory access. row_count: maximum number of rows to be processed. The function performs the warp sync only if all 32 threads within that warp will process valid data. NOTE: it currently assumes that the warp size is 32.

Definition at line 1300 of file cuda_mapd_rt.cu.

1300  {
1301  // only syncing if NOT within the same warp as those threads experiencing the critical
1302  // edge
1303  if ((((row_count - 1) | 0x1F) - thread_pos) >= 32) {
1304  __syncwarp();
1305  }
1306 }
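
The expression ((row_count - 1) | 0x1F) rounds the last valid row index up to the last lane of its warp, so the comparison lets every warp strictly below that warp take the barrier while the warp containing the last valid row (and any warp after it) skips it. A host-side check of the same arithmetic for row_count = 80, purely illustrative:

#include <cstdint>
#include <cstdio>

int main() {
  const int64_t row_count = 80;  // last valid row index: 79, rounded up to 95
  const int64_t thread_positions[] = {31, 63, 64, 79, 95};
  for (const int64_t thread_pos : thread_positions) {
    const bool syncs = (((row_count - 1) | 0x1F) - thread_pos) >= 32;
    printf("thread %lld %s\n",
           static_cast<long long>(thread_pos),
           syncs ? "participates in __syncwarp()" : "skips the barrier");
  }
  return 0;
}
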
__device__ int8_t thread_warp_idx ( const int8_t  warp_sz)

Definition at line 39 of file cuda_mapd_rt.cu.

39  {
40  return threadIdx.x % warp_sz;
41 }
__device__ void write_back_non_grouped_agg ( int64_t *  input_buffer,
int64_t *  output_buffer,
const int32_t  agg_idx 
)

Definition at line 1320 of file cuda_mapd_rt.cu.

References agg_sum_shared().

1322  {
1323  if (threadIdx.x == agg_idx) {
1324  agg_sum_shared(output_buffer, input_buffer[agg_idx]);
1325  }
1326 }
__device__ int64_t agg_sum_shared(int64_t *agg, const int64_t val)

__device__ void write_back_nop ( int64_t *  dest,
int64_t *  src,
const int32_t  sz 
)

Definition at line 49 of file cuda_mapd_rt.cu.

49  {
50 }

Variable Documentation

__device__ int32_t dw_abort = 0

Definition at line 94 of file cuda_mapd_rt.cu.

Referenced by dynamic_watchdog(), dynamic_watchdog_init(), and Executor::interrupt().

__device__ int64_t dw_cycle_budget = 0

Definition at line 93 of file cuda_mapd_rt.cu.

Referenced by dynamic_watchdog(), and dynamic_watchdog_init().

__device__ int64_t dw_sm_cycle_start[128]

Definition at line 91 of file cuda_mapd_rt.cu.

Referenced by dynamic_watchdog().

__device__ int32_t runtime_interrupt_flag = 0

Definition at line 95 of file cuda_mapd_rt.cu.

Referenced by check_interrupt(), check_interrupt_init(), and Executor::interrupt().