OmniSciDB  b28c0d5765
cuda_mapd_rt.cu File Reference
#include <cuda.h>
#include <float.h>
#include <stdint.h>
#include <stdio.h>
#include <limits>
#include "BufferCompaction.h"
#include "ExtensionFunctions.hpp"
#include "GpuRtConstants.h"
#include "HyperLogLogRank.h"
#include "GpuInitGroups.cu"
#include "GroupByRuntime.cpp"
#include "JoinHashTable/Runtime/JoinHashTableQueryRuntime.cpp"
#include "MurmurHash.cpp"
#include "TopKRuntime.cpp"
#include "../Utils/ChunkIter.cpp"
#include "DateTruncate.cpp"
#include "ExtractFromTime.cpp"
#include "ArrayOps.cpp"
#include "DateAdd.cpp"
#include "GeoOps.cpp"
#include "StringFunctions.cpp"
#include "../Utils/Regexp.cpp"
#include "../Utils/StringLike.cpp"


Macros

#define init_group_by_buffer_gpu_impl   init_group_by_buffer_gpu
 
#define DEF_AGG_ID_INT_SHARED(n)
 
#define DEF_SKIP_AGG(base_agg_func)
 
#define DATA_T   int64_t
 
#define ADDR_T   uint64_t
 
#define DATA_T   int32_t
 
#define ADDR_T   uint32_t
 
#define DEF_SKIP_AGG(base_agg_func)
 
#define DATA_T   double
 
#define ADDR_T   uint64_t
 
#define DATA_T   float
 
#define ADDR_T   uint32_t
 
#define EXECUTE_INCLUDE
 

Functions

__device__ int64_t get_thread_index ()
 
__device__ int64_t get_block_index ()
 
__device__ int32_t pos_start_impl (const int32_t *row_index_resume)
 
__device__ int32_t group_buff_idx_impl ()
 
__device__ int32_t pos_step_impl ()
 
__device__ int8_t thread_warp_idx (const int8_t warp_sz)
 
__device__ const int64_t * init_shared_mem_nop (const int64_t *groups_buffer, const int32_t groups_buffer_size)
 
__device__ void write_back_nop (int64_t *dest, int64_t *src, const int32_t sz)
 
__device__ int64_t * declare_dynamic_shared_memory ()
 
__device__ const int64_t * init_shared_mem (const int64_t *global_groups_buffer, const int32_t groups_buffer_size)
 
__inline__ __device__ uint32_t get_smid (void)
 
__device__ bool dynamic_watchdog ()
 
__device__ bool check_interrupt ()
 
template<typename T = unsigned long long>
__device__ T get_empty_key ()
 
template<>
__device__ unsigned int get_empty_key ()
 
template<typename T >
__device__ int64_t * get_matching_group_value (int64_t *groups_buffer, const uint32_t h, const T *key, const uint32_t key_count, const uint32_t row_size_quad)
 
__device__ int64_t * get_matching_group_value (int64_t *groups_buffer, const uint32_t h, const int64_t *key, const uint32_t key_count, const uint32_t key_width, const uint32_t row_size_quad)
 
template<typename T >
__device__ int32_t get_matching_group_value_columnar_slot (int64_t *groups_buffer, const uint32_t entry_count, const uint32_t h, const T *key, const uint32_t key_count)
 
__device__ int32_t get_matching_group_value_columnar_slot (int64_t *groups_buffer, const uint32_t entry_count, const uint32_t h, const int64_t *key, const uint32_t key_count, const uint32_t key_width)
 
__device__ int64_t * get_matching_group_value_columnar (int64_t *groups_buffer, const uint32_t h, const int64_t *key, const uint32_t key_qw_count, const size_t entry_count)
 
__device__ int64_t atomicMax64 (int64_t *address, int64_t val)
 
__device__ int64_t atomicMin64 (int64_t *address, int64_t val)
 
__device__ double atomicMax (double *address, double val)
 
__device__ float atomicMax (float *address, float val)
 
__device__ double atomicMin (double *address, double val)
 
__device__ double atomicMin (float *address, float val)
 
__device__ uint64_t agg_count_shared (uint64_t *agg, const int64_t val)
 
__device__ uint64_t agg_count_if_shared (uint64_t *agg, const int64_t cond)
 
__device__ uint32_t agg_count_int32_shared (uint32_t *agg, const int32_t val)
 
__device__ uint32_t agg_count_if_int32_shared (uint32_t *agg, const int32_t cond)
 
__device__ uint64_t agg_count_double_shared (uint64_t *agg, const double val)
 
__device__ uint32_t agg_count_float_shared (uint32_t *agg, const float val)
 
__device__ int64_t agg_sum_shared (int64_t *agg, const int64_t val)
 
__device__ int32_t agg_sum_int32_shared (int32_t *agg, const int32_t val)
 
__device__ void agg_sum_float_shared (int32_t *agg, const float val)
 
__device__ void agg_sum_double_shared (int64_t *agg, const double val)
 
__device__ void agg_max_shared (int64_t *agg, const int64_t val)
 
__device__ void agg_max_int32_shared (int32_t *agg, const int32_t val)
 
__device__ void agg_max_double_shared (int64_t *agg, const double val)
 
__device__ void agg_max_float_shared (int32_t *agg, const float val)
 
__device__ void agg_min_shared (int64_t *agg, const int64_t val)
 
__device__ void agg_min_int32_shared (int32_t *agg, const int32_t val)
 
__device__ void atomicMax16 (int16_t *agg, const int16_t val)
 
__device__ void atomicMax8 (int8_t *agg, const int8_t val)
 
__device__ void atomicMin16 (int16_t *agg, const int16_t val)
 
__device__ void atomicMin16SkipVal (int16_t *agg, const int16_t val, const int16_t skip_val)
 
__device__ void atomicMin8 (int8_t *agg, const int8_t val)
 
__device__ void atomicMin8SkipVal (int8_t *agg, const int8_t val, const int8_t skip_val)
 
__device__ void agg_max_int16_shared (int16_t *agg, const int16_t val)
 
__device__ void agg_max_int8_shared (int8_t *agg, const int8_t val)
 
__device__ void agg_min_int16_shared (int16_t *agg, const int16_t val)
 
__device__ void agg_min_int8_shared (int8_t *agg, const int8_t val)
 
__device__ void agg_min_double_shared (int64_t *agg, const double val)
 
__device__ void agg_min_float_shared (int32_t *agg, const float val)
 
__device__ void agg_id_shared (int64_t *agg, const int64_t val)
 
__device__ int8_t * agg_id_varlen_shared (int8_t *varlen_buffer, const int64_t offset, const int8_t *value, const int64_t size_bytes)
 
__device__ int32_t checked_single_agg_id_shared (int64_t *agg, const int64_t val, const int64_t null_val)
 
__device__ void agg_id_double_shared (int64_t *agg, const double val)
 
__device__ int32_t checked_single_agg_id_double_shared (int64_t *agg, const double val, const double null_val)
 
__device__ void agg_id_double_shared_slow (int64_t *agg, const double *val)
 
__device__ int32_t checked_single_agg_id_double_shared_slow (int64_t *agg, const double *valp, const double null_val)
 
__device__ void agg_id_float_shared (int32_t *agg, const float val)
 
__device__ int32_t checked_single_agg_id_float_shared (int32_t *agg, const float val, const float null_val)
 
__device__ void agg_max_int32_skip_val_shared (int32_t *agg, const int32_t val, const int32_t skip_val)
 
__device__ void agg_max_int16_skip_val_shared (int16_t *agg, const int16_t val, const int16_t skip_val)
 
__device__ void agg_min_int16_skip_val_shared (int16_t *agg, const int16_t val, const int16_t skip_val)
 
__device__ void agg_max_int8_skip_val_shared (int8_t *agg, const int8_t val, const int8_t skip_val)
 
__device__ void agg_min_int8_skip_val_shared (int8_t *agg, const int8_t val, const int8_t skip_val)
 
__device__ int32_t atomicMin32SkipVal (int32_t *address, int32_t val, const int32_t skip_val)
 
__device__ void agg_min_int32_skip_val_shared (int32_t *agg, const int32_t val, const int32_t skip_val)
 
__device__ int32_t atomicSum32SkipVal (int32_t *address, const int32_t val, const int32_t skip_val)
 
__device__ int32_t agg_sum_int32_skip_val_shared (int32_t *agg, const int32_t val, const int32_t skip_val)
 
__device__ int64_t atomicSum64SkipVal (int64_t *address, const int64_t val, const int64_t skip_val)
 
__device__ int64_t agg_sum_skip_val_shared (int64_t *agg, const int64_t val, const int64_t skip_val)
 
__device__ int64_t atomicMin64SkipVal (int64_t *address, int64_t val, const int64_t skip_val)
 
__device__ void agg_min_skip_val_shared (int64_t *agg, const int64_t val, const int64_t skip_val)
 
__device__ int64_t atomicMax64SkipVal (int64_t *address, int64_t val, const int64_t skip_val)
 
__device__ void agg_max_skip_val_shared (int64_t *agg, const int64_t val, const int64_t skip_val)
 
__device__ void agg_max_float_skip_val_shared (int32_t *agg, const float val, const float skip_val)
 
__device__ float atomicMinFltSkipVal (int32_t *address, float val, const float skip_val)
 
__device__ void agg_min_float_skip_val_shared (int32_t *agg, const float val, const float skip_val)
 
__device__ void atomicSumFltSkipVal (float *address, const float val, const float skip_val)
 
__device__ void agg_sum_float_skip_val_shared (int32_t *agg, const float val, const float skip_val)
 
__device__ void atomicSumDblSkipVal (double *address, const double val, const double skip_val)
 
__device__ void agg_sum_double_skip_val_shared (int64_t *agg, const double val, const double skip_val)
 
__device__ double atomicMinDblSkipVal (double *address, double val, const double skip_val)
 
__device__ void agg_min_double_skip_val_shared (int64_t *agg, const double val, const double skip_val)
 
__device__ void agg_max_double_skip_val_shared (int64_t *agg, const double val, const double skip_val)
 
__device__ bool slotEmptyKeyCAS (int64_t *slot, int64_t new_val, int64_t init_val)
 
__device__ bool slotEmptyKeyCAS_int32 (int32_t *slot, int32_t new_val, int32_t init_val)
 
__device__ bool slotEmptyKeyCAS_int16 (int16_t *slot, int16_t new_val, int16_t init_val)
 
__device__ bool slotEmptyKeyCAS_int8 (int8_t *slot, int8_t new_val, int8_t init_val)
 
__device__ uint64_t string_decode (int8_t *chunk_iter_, int64_t pos)
 
__device__ void linear_probabilistic_count (uint8_t *bitmap, const uint32_t bitmap_bytes, const uint8_t *key_bytes, const uint32_t key_len)
 
__device__ void agg_count_distinct_bitmap_gpu (int64_t *agg, const int64_t val, const int64_t min_val, const int64_t base_dev_addr, const int64_t base_host_addr, const uint64_t sub_bitmap_count, const uint64_t bitmap_bytes)
 
__device__ void agg_count_distinct_bitmap_skip_val_gpu (int64_t *agg, const int64_t val, const int64_t min_val, const int64_t skip_val, const int64_t base_dev_addr, const int64_t base_host_addr, const uint64_t sub_bitmap_count, const uint64_t bitmap_bytes)
 
__device__ void agg_approximate_count_distinct_gpu (int64_t *agg, const int64_t key, const uint32_t b, const int64_t base_dev_addr, const int64_t base_host_addr)
 
__device__ void force_sync ()
 
__device__ void sync_warp ()
 
__device__ void sync_warp_protected (int64_t thread_pos, int64_t row_count)
 
__device__ void sync_threadblock ()
 
__device__ void write_back_non_grouped_agg (int64_t *input_buffer, int64_t *output_buffer, const int32_t agg_idx)
 

Variables

__device__ int64_t dw_sm_cycle_start [128]
 
__device__ int64_t dw_cycle_budget = 0
 
__device__ int32_t dw_abort = 0
 
__device__ int32_t runtime_interrupt_flag = 0
 

Macro Definition Documentation

#define ADDR_T   uint64_t

Definition at line 1017 of file cuda_mapd_rt.cu.

#define ADDR_T   uint32_t

Definition at line 1017 of file cuda_mapd_rt.cu.

#define ADDR_T   uint64_t

Definition at line 1017 of file cuda_mapd_rt.cu.

#define ADDR_T   uint32_t

Definition at line 1017 of file cuda_mapd_rt.cu.

#define DATA_T   int64_t

Definition at line 1016 of file cuda_mapd_rt.cu.

#define DATA_T   int32_t

Definition at line 1016 of file cuda_mapd_rt.cu.

#define DATA_T   double

Definition at line 1016 of file cuda_mapd_rt.cu.

#define DATA_T   float

Definition at line 1016 of file cuda_mapd_rt.cu.

#define DEF_AGG_ID_INT_SHARED (   n)
Value:
extern "C" __device__ void agg_id_int##n##_shared(int##n##_t* agg, \
const int##n##_t val) { \
*agg = val; \
}

Definition at line 727 of file cuda_mapd_rt.cu.
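For illustration only (this expansion is not shown on the page): instantiating the macro with n = 32 stamps out a shared-memory "id" aggregate for 32-bit slots, analogous to agg_id_shared below.

extern "C" __device__ void agg_id_int32_shared(int32_t* agg, const int32_t val) {
  // "id" aggregate: simply store the value in the slot
  *agg = val;
}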

#define DEF_SKIP_AGG (   base_agg_func)
Value:
extern "C" __device__ ADDR_T base_agg_func##_skip_val_shared( \
ADDR_T* agg, const DATA_T val, const DATA_T skip_val) { \
if (val != skip_val) { \
return base_agg_func##_shared(agg, val); \
} \
return 0; \
}
#define DATA_T
#define ADDR_T

Definition at line 1001 of file cuda_mapd_rt.cu.

#define DEF_SKIP_AGG (   base_agg_func)
Value:
extern "C" __device__ ADDR_T base_agg_func##_skip_val_shared( \
ADDR_T* agg, const DATA_T val, const DATA_T skip_val) { \
if (val != skip_val) { \
return base_agg_func##_shared(agg, val); \
} \
return *agg; \
}
#define DATA_T
#define ADDR_T

Definition at line 1001 of file cuda_mapd_rt.cu.
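As a sketch of how the second form above expands (an assumption for illustration; the page does not record which base aggregates the macro is applied to), taking base_agg_func = agg_count with DATA_T = int64_t and ADDR_T = uint64_t would yield:

extern "C" __device__ uint64_t agg_count_skip_val_shared(uint64_t* agg,
                                                         const int64_t val,
                                                         const int64_t skip_val) {
  // only fold the value into the aggregate when it is not the NULL sentinel
  if (val != skip_val) {
    return agg_count_shared(agg, val);
  }
  return *agg;
}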

#define EXECUTE_INCLUDE

Definition at line 1206 of file cuda_mapd_rt.cu.

#define init_group_by_buffer_gpu_impl   init_group_by_buffer_gpu

Definition at line 82 of file cuda_mapd_rt.cu.

Function Documentation

__device__ void agg_approximate_count_distinct_gpu ( int64_t *  agg,
const int64_t  key,
const uint32_t  b,
const int64_t  base_dev_addr,
const int64_t  base_host_addr 
)

Definition at line 1283 of file cuda_mapd_rt.cu.

References atomicMax(), get_rank(), and MurmurHash64A().

1288  {
1289  const uint64_t hash = MurmurHash64A(&key, sizeof(key), 0);
1290  const uint32_t index = hash >> (64 - b);
1291  const int32_t rank = get_rank(hash << b, 64 - b);
1292  const int64_t host_addr = *agg;
1293  int32_t* M = (int32_t*)(base_dev_addr + host_addr - base_host_addr);
1294  atomicMax(&M[index], rank);
1295 }
FORCE_INLINE uint8_t get_rank(uint64_t x, uint32_t b)
RUNTIME_EXPORT NEVER_INLINE DEVICE uint64_t MurmurHash64A(const void *key, int len, uint64_t seed)
Definition: MurmurHash.cpp:27
__device__ double atomicMax(double *address, double val)
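Illustrative restatement (an assumption, not taken from the page): the same HyperLogLog register update written with CUDA's __clzll intrinsic in place of get_rank(). The top b bits of the 64-bit MurmurHash select one of the 2^b registers; the rank is the leading-zero count of the remaining bits plus one, and atomicMax keeps the largest rank seen per register.

__device__ void hll_update_sketch(int32_t* M, const uint64_t hash, const uint32_t b) {
  const uint32_t index = hash >> (64 - b);   // top b bits pick the register
  const uint64_t rest = hash << b;           // remaining 64 - b bits
  const int32_t rank = rest ? __clzll(static_cast<long long>(rest)) + 1
                            : static_cast<int32_t>(64 - b) + 1;
  atomicMax(&M[index], rank);                // fold the rank into the register
}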


__device__ void agg_count_distinct_bitmap_gpu ( int64_t *  agg,
const int64_t  val,
const int64_t  min_val,
const int64_t  base_dev_addr,
const int64_t  base_host_addr,
const uint64_t  sub_bitmap_count,
const uint64_t  bitmap_bytes 
)

Definition at line 1236 of file cuda_mapd_rt.cu.

Referenced by agg_count_distinct_bitmap_skip_val_gpu().

1242  {
1243  const uint64_t bitmap_idx = val - min_val;
1244  const uint32_t byte_idx = bitmap_idx >> 3;
1245  const uint32_t word_idx = byte_idx >> 2;
1246  const uint32_t byte_word_idx = byte_idx & 3;
1247  const int64_t host_addr = *agg;
1248  uint32_t* bitmap = (uint32_t*)(base_dev_addr + host_addr - base_host_addr +
1249  (threadIdx.x & (sub_bitmap_count - 1)) * bitmap_bytes);
1250  switch (byte_word_idx) {
1251  case 0:
1252  atomicOr(&bitmap[word_idx], 1 << (bitmap_idx & 7));
1253  break;
1254  case 1:
1255  atomicOr(&bitmap[word_idx], 1 << ((bitmap_idx & 7) + 8));
1256  break;
1257  case 2:
1258  atomicOr(&bitmap[word_idx], 1 << ((bitmap_idx & 7) + 16));
1259  break;
1260  case 3:
1261  atomicOr(&bitmap[word_idx], 1 << ((bitmap_idx & 7) + 24));
1262  break;
1263  default:
1264  break;
1265  }
1266 }
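Because the bitmap is viewed as little-endian uint32_t words, the byte/word case split above collapses to a single shift within a 32-bit word. The following sketch (illustrative only, not from the source) shows the equivalent one-liner for one sub-bitmap:

__device__ void set_distinct_bit_sketch(uint32_t* bitmap, const uint64_t bitmap_idx) {
  // word (bitmap_idx >> 5), bit (bitmap_idx & 31): the same bit the switch above sets
  atomicOr(&bitmap[bitmap_idx >> 5], 1u << (bitmap_idx & 31));
}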


__device__ void agg_count_distinct_bitmap_skip_val_gpu ( int64_t *  agg,
const int64_t  val,
const int64_t  min_val,
const int64_t  skip_val,
const int64_t  base_dev_addr,
const int64_t  base_host_addr,
const uint64_t  sub_bitmap_count,
const uint64_t  bitmap_bytes 
)

Definition at line 1268 of file cuda_mapd_rt.cu.

References agg_count_distinct_bitmap_gpu().

1276  {
1277  if (val != skip_val) {
1278  agg_count_distinct_bitmap_gpu(
1279  agg, val, min_val, base_dev_addr, base_host_addr, sub_bitmap_count, bitmap_bytes);
1280  }
1281 }
__device__ void agg_count_distinct_bitmap_gpu(int64_t *agg, const int64_t val, const int64_t min_val, const int64_t base_dev_addr, const int64_t base_host_addr, const uint64_t sub_bitmap_count, const uint64_t bitmap_bytes)


__device__ uint64_t agg_count_double_shared ( uint64_t *  agg,
const double  val 
)

Definition at line 448 of file cuda_mapd_rt.cu.

References agg_count_shared().

448  {
449  return agg_count_shared(agg, val);
450 }
__device__ uint64_t agg_count_shared(uint64_t *agg, const int64_t val)


__device__ uint32_t agg_count_float_shared ( uint32_t *  agg,
const float  val 
)

Definition at line 452 of file cuda_mapd_rt.cu.

References agg_count_int32_shared().

452  {
453  return agg_count_int32_shared(agg, val);
454 }
__device__ uint32_t agg_count_int32_shared(uint32_t *agg, const int32_t val)


__device__ uint32_t agg_count_if_int32_shared ( uint32_t *  agg,
const int32_t  cond 
)

Definition at line 443 of file cuda_mapd_rt.cu.

444  {
445  return cond ? atomicAdd(agg, 1U) : *agg;
446 }
__device__ uint64_t agg_count_if_shared ( uint64_t *  agg,
const int64_t  cond 
)

Definition at line 434 of file cuda_mapd_rt.cu.

434  {
435  return cond ? static_cast<uint64_t>(atomicAdd(reinterpret_cast<uint32_t*>(agg), 1U))
436  : static_cast<uint64_t>(*(reinterpret_cast<uint32_t*>(agg)));
437 }
__device__ uint32_t agg_count_int32_shared ( uint32_t *  agg,
const int32_t  val 
)

Definition at line 439 of file cuda_mapd_rt.cu.

Referenced by agg_count_float_shared().

439  {
440  return atomicAdd(agg, 1U);
441 }


__device__ uint64_t agg_count_shared ( uint64_t *  agg,
const int64_t  val 
)

Definition at line 430 of file cuda_mapd_rt.cu.

Referenced by agg_count_double_shared().

430  {
431  return static_cast<uint64_t>(atomicAdd(reinterpret_cast<uint32_t*>(agg), 1U));
432 }


__device__ void agg_id_double_shared ( int64_t *  agg,
const double  val 
)

Definition at line 739 of file cuda_mapd_rt.cu.

739  {
740  *agg = *(reinterpret_cast<const int64_t*>(&val));
741 }
__device__ void agg_id_double_shared_slow ( int64_t *  agg,
const double *  val 
)

Definition at line 770 of file cuda_mapd_rt.cu.

770  {
771  *agg = *(reinterpret_cast<const int64_t*>(val));
772 }
__device__ void agg_id_float_shared ( int32_t *  agg,
const float  val 
)

Definition at line 803 of file cuda_mapd_rt.cu.

803  {
804  *agg = __float_as_int(val);
805 }
__device__ void agg_id_shared ( int64_t *  agg,
const int64_t  val 
)

Definition at line 686 of file cuda_mapd_rt.cu.

686  {
687  *agg = val;
688 }
__device__ int8_t* agg_id_varlen_shared ( int8_t *  varlen_buffer,
const int64_t  offset,
const int8_t *  value,
const int64_t  size_bytes 
)

Definition at line 690 of file cuda_mapd_rt.cu.

693  {
694  for (auto i = 0; i < size_bytes; i++) {
695  varlen_buffer[offset + i] = value[i];
696  }
697  return &varlen_buffer[offset];
698 }
__device__ void agg_max_double_shared ( int64_t *  agg,
const double  val 
)

Definition at line 480 of file cuda_mapd_rt.cu.

References atomicMax().

480  {
481  atomicMax(reinterpret_cast<double*>(agg), val);
482 }
__device__ double atomicMax(double *address, double val)


__device__ void agg_max_double_skip_val_shared ( int64_t *  agg,
const double  val,
const double  skip_val 
)

Definition at line 1111 of file cuda_mapd_rt.cu.

References atomicMax().

1113  {
1114  if (__double_as_longlong(val) != __double_as_longlong(skip_val)) {
1115  double old = __longlong_as_double(atomicExch(
1116  reinterpret_cast<unsigned long long int*>(agg), __double_as_longlong(-DBL_MAX)));
1117  atomicMax(reinterpret_cast<double*>(agg),
1118  __double_as_longlong(old) == __double_as_longlong(skip_val)
1119  ? val
1120  : fmax(old, val));
1121  }
1122 }
__device__ double atomicMax(double *address, double val)


__device__ void agg_max_float_shared ( int32_t *  agg,
const float  val 
)

Definition at line 484 of file cuda_mapd_rt.cu.

References atomicMax().

484  {
485  atomicMax(reinterpret_cast<float*>(agg), val);
486 }
__device__ double atomicMax(double *address, double val)


__device__ void agg_max_float_skip_val_shared ( int32_t *  agg,
const float  val,
const float  skip_val 
)

Definition at line 1023 of file cuda_mapd_rt.cu.

References atomicMax().

1025  {
1026  if (__float_as_int(val) != __float_as_int(skip_val)) {
1027  float old = atomicExch(reinterpret_cast<float*>(agg), -FLT_MAX);
1028  atomicMax(reinterpret_cast<float*>(agg),
1029  __float_as_int(old) == __float_as_int(skip_val) ? val : fmaxf(old, val));
1030  }
1031 }
__device__ double atomicMax(double *address, double val)


__device__ void agg_max_int16_shared ( int16_t *  agg,
const int16_t  val 
)

Definition at line 662 of file cuda_mapd_rt.cu.

References atomicMax16().

Referenced by agg_max_int16_skip_val_shared().

662  {
663  return atomicMax16(agg, val);
664 }
__device__ void atomicMax16(int16_t *agg, const int16_t val)


__device__ void agg_max_int16_skip_val_shared ( int16_t *  agg,
const int16_t  val,
const int16_t  skip_val 
)

Definition at line 866 of file cuda_mapd_rt.cu.

References agg_max_int16_shared().

868  {
869  if (val != skip_val) {
870  agg_max_int16_shared(agg, val);
871  }
872 }
__device__ void agg_max_int16_shared(int16_t *agg, const int16_t val)


__device__ void agg_max_int32_shared ( int32_t *  agg,
const int32_t  val 
)

Definition at line 476 of file cuda_mapd_rt.cu.

References atomicMax().

Referenced by agg_max_int32_skip_val_shared().

476  {
477  atomicMax(agg, val);
478 }
__device__ double atomicMax(double *address, double val)


__device__ void agg_max_int32_skip_val_shared ( int32_t *  agg,
const int32_t  val,
const int32_t  skip_val 
)

Definition at line 858 of file cuda_mapd_rt.cu.

References agg_max_int32_shared().

860  {
861  if (val != skip_val) {
862  agg_max_int32_shared(agg, val);
863  }
864 }
__device__ void agg_max_int32_shared(int32_t *agg, const int32_t val)


__device__ void agg_max_int8_shared ( int8_t *  agg,
const int8_t  val 
)

Definition at line 666 of file cuda_mapd_rt.cu.

References atomicMax8().

Referenced by agg_max_int8_skip_val_shared().

666  {
667  return atomicMax8(agg, val);
668 }
__device__ void atomicMax8(int8_t *agg, const int8_t val)


__device__ void agg_max_int8_skip_val_shared ( int8_t *  agg,
const int8_t  val,
const int8_t  skip_val 
)

Definition at line 882 of file cuda_mapd_rt.cu.

References agg_max_int8_shared().

884  {
885  if (val != skip_val) {
886  agg_max_int8_shared(agg, val);
887  }
888 }
__device__ void agg_max_int8_shared(int8_t *agg, const int8_t val)


__device__ void agg_max_shared ( int64_t *  agg,
const int64_t  val 
)

Definition at line 472 of file cuda_mapd_rt.cu.

References atomicMax64().

472  {
473  atomicMax64(agg, val);
474 }
__device__ int64_t atomicMax64(int64_t *address, int64_t val)


__device__ void agg_max_skip_val_shared ( int64_t *  agg,
const int64_t  val,
const int64_t  skip_val 
)

Definition at line 992 of file cuda_mapd_rt.cu.

References atomicMax64SkipVal().

994  {
995  if (val != skip_val) {
996  atomicMax64SkipVal(agg, val, skip_val);
997  }
998 }
__device__ int64_t atomicMax64SkipVal(int64_t *address, int64_t val, const int64_t skip_val)


__device__ void agg_min_double_shared ( int64_t *  agg,
const double  val 
)

Definition at line 678 of file cuda_mapd_rt.cu.

References atomicMin().

678  {
679  atomicMin(reinterpret_cast<double*>(agg), val);
680 }
__device__ double atomicMin(double *address, double val)


__device__ void agg_min_double_skip_val_shared ( int64_t *  agg,
const double  val,
const double  skip_val 
)

Definition at line 1103 of file cuda_mapd_rt.cu.

References atomicMinDblSkipVal().

1105  {
1106  if (val != skip_val) {
1107  atomicMinDblSkipVal(reinterpret_cast<double*>(agg), val, skip_val);
1108  }
1109 }
__device__ double atomicMinDblSkipVal(double *address, double val, const double skip_val)


__device__ void agg_min_float_shared ( int32_t *  agg,
const float  val 
)

Definition at line 682 of file cuda_mapd_rt.cu.

References atomicMin().

682  {
683  atomicMin(reinterpret_cast<float*>(agg), val);
684 }
__device__ double atomicMin(double *address, double val)


__device__ void agg_min_float_skip_val_shared ( int32_t *  agg,
const float  val,
const float  skip_val 
)

Definition at line 1040 of file cuda_mapd_rt.cu.

References atomicMinFltSkipVal().

1042  {
1043  if (__float_as_int(val) != __float_as_int(skip_val)) {
1044  atomicMinFltSkipVal(agg, val, skip_val);
1045  }
1046 }
__device__ float atomicMinFltSkipVal(int32_t *address, float val, const float skip_val)


__device__ void agg_min_int16_shared ( int16_t *  agg,
const int16_t  val 
)

Definition at line 670 of file cuda_mapd_rt.cu.

References atomicMin16().

670  {
671  return atomicMin16(agg, val);
672 }
__device__ void atomicMin16(int16_t *agg, const int16_t val)


__device__ void agg_min_int16_skip_val_shared ( int16_t *  agg,
const int16_t  val,
const int16_t  skip_val 
)

Definition at line 874 of file cuda_mapd_rt.cu.

References atomicMin16SkipVal().

876  {
877  if (val != skip_val) {
878  atomicMin16SkipVal(agg, val, skip_val);
879  }
880 }
__device__ void atomicMin16SkipVal(int16_t *agg, const int16_t val, const int16_t skip_val)


__device__ void agg_min_int32_shared ( int32_t *  agg,
const int32_t  val 
)

Definition at line 492 of file cuda_mapd_rt.cu.

References atomicMin().

492  {
493  atomicMin(agg, val);
494 }
__device__ double atomicMin(double *address, double val)


__device__ void agg_min_int32_skip_val_shared ( int32_t *  agg,
const int32_t  val,
const int32_t  skip_val 
)

Definition at line 905 of file cuda_mapd_rt.cu.

References atomicMin32SkipVal().

907  {
908  if (val != skip_val) {
909  atomicMin32SkipVal(agg, val, skip_val);
910  }
911 }
__device__ int32_t atomicMin32SkipVal(int32_t *address, int32_t val, const int32_t skip_val)


__device__ void agg_min_int8_shared ( int8_t *  agg,
const int8_t  val 
)

Definition at line 674 of file cuda_mapd_rt.cu.

References atomicMin8().

674  {
675  return atomicMin8(agg, val);
676 }
__device__ void atomicMin8(int8_t *agg, const int8_t val)


__device__ void agg_min_int8_skip_val_shared ( int8_t *  agg,
const int8_t  val,
const int8_t  skip_val 
)

Definition at line 890 of file cuda_mapd_rt.cu.

References atomicMin8SkipVal().

892  {
893  if (val != skip_val) {
894  atomicMin8SkipVal(agg, val, skip_val);
895  }
896 }
__device__ void atomicMin8SkipVal(int8_t *agg, const int8_t val, const int8_t skip_val)


__device__ void agg_min_shared ( int64_t *  agg,
const int64_t  val 
)

Definition at line 488 of file cuda_mapd_rt.cu.

References atomicMin64().

488  {
489  atomicMin64(agg, val);
490 }
__device__ int64_t atomicMin64(int64_t *address, int64_t val)


__device__ void agg_min_skip_val_shared ( int64_t *  agg,
const int64_t  val,
const int64_t  skip_val 
)

Definition at line 967 of file cuda_mapd_rt.cu.

References atomicMin64SkipVal().

969  {
970  if (val != skip_val) {
971  atomicMin64SkipVal(agg, val, skip_val);
972  }
973 }
__device__ int64_t atomicMin64SkipVal(int64_t *address, int64_t val, const int64_t skip_val)


__device__ void agg_sum_double_shared ( int64_t *  agg,
const double  val 
)

Definition at line 468 of file cuda_mapd_rt.cu.

468  {
469  atomicAdd(reinterpret_cast<double*>(agg), val);
470 }
__device__ void agg_sum_double_skip_val_shared ( int64_t *  agg,
const double  val,
const double  skip_val 
)

Definition at line 1073 of file cuda_mapd_rt.cu.

References atomicSumDblSkipVal().

1075  {
1076  if (__double_as_longlong(val) != __double_as_longlong(skip_val)) {
1077  atomicSumDblSkipVal(reinterpret_cast<double*>(agg), val, skip_val);
1078  }
1079 }
__device__ void atomicSumDblSkipVal(double *address, const double val, const double skip_val)


__device__ void agg_sum_float_shared ( int32_t *  agg,
const float  val 
)

Definition at line 464 of file cuda_mapd_rt.cu.

464  {
465  atomicAdd(reinterpret_cast<float*>(agg), val);
466 }
__device__ void agg_sum_float_skip_val_shared ( int32_t *  agg,
const float  val,
const float  skip_val 
)

Definition at line 1055 of file cuda_mapd_rt.cu.

References atomicSumFltSkipVal().

1057  {
1058  if (__float_as_int(val) != __float_as_int(skip_val)) {
1059  atomicSumFltSkipVal(reinterpret_cast<float*>(agg), val, skip_val);
1060  }
1061 }
__device__ void atomicSumFltSkipVal(float *address, const float val, const float skip_val)


__device__ int32_t agg_sum_int32_shared ( int32_t *  agg,
const int32_t  val 
)

Definition at line 460 of file cuda_mapd_rt.cu.

460  {
461  return atomicAdd(agg, val);
462 }
__device__ int32_t agg_sum_int32_skip_val_shared ( int32_t *  agg,
const int32_t  val,
const int32_t  skip_val 
)

Definition at line 922 of file cuda_mapd_rt.cu.

References atomicSum32SkipVal().

924  {
925  if (val != skip_val) {
926  const int32_t old = atomicSum32SkipVal(agg, val, skip_val);
927  return old;
928  }
929  return 0;
930 }
__device__ int32_t atomicSum32SkipVal(int32_t *address, const int32_t val, const int32_t skip_val)


__device__ int64_t agg_sum_shared ( int64_t *  agg,
const int64_t  val 
)

Definition at line 456 of file cuda_mapd_rt.cu.

Referenced by write_back_non_grouped_agg().

456  {
457  return atomicAdd(reinterpret_cast<unsigned long long*>(agg), val);
458 }


__device__ int64_t agg_sum_skip_val_shared ( int64_t *  agg,
const int64_t  val,
const int64_t  skip_val 
)

Definition at line 941 of file cuda_mapd_rt.cu.

References atomicSum64SkipVal().

943  {
944  if (val != skip_val) {
945  return atomicSum64SkipVal(agg, val, skip_val);
946  }
947  return 0;
948 }
__device__ int64_t atomicSum64SkipVal(int64_t *address, const int64_t val, const int64_t skip_val)


__device__ double atomicMax ( double *  address,
double  val 
)

Definition at line 372 of file cuda_mapd_rt.cu.

Referenced by agg_approximate_count_distinct_gpu(), agg_max_double_shared(), agg_max_double_skip_val_shared(), agg_max_float_shared(), agg_max_float_skip_val_shared(), agg_max_int32_shared(), and approximate_distinct_tuples_impl().

372  {
373  unsigned long long int* address_as_ull = (unsigned long long int*)address;
374  unsigned long long int old = *address_as_ull, assumed;
375 
376  do {
377  assumed = old;
378  old = atomicCAS(address_as_ull,
379  assumed,
380  __double_as_longlong(max(val, __longlong_as_double(assumed))));
381 
382  // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN)
383  } while (assumed != old);
384 
385  return __longlong_as_double(old);
386 }
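A minimal usage sketch (an assumption, not shown on this page): reducing per-thread double values into one global maximum via the CAS-based overload above; the result slot is expected to be initialized to -DBL_MAX by the host.

__global__ void max_reduce_sketch(const double* in, const int n, double* result) {
  const int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) {
    atomicMax(result, in[i]);  // relies on the double overload defined above
  }
}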


__device__ float atomicMax ( float *  address,
float  val 
)

Definition at line 388 of file cuda_mapd_rt.cu.

388  {
389  int* address_as_int = (int*)address;
390  int old = *address_as_int, assumed;
391 
392  do {
393  assumed = old;
394  old = atomicCAS(
395  address_as_int, assumed, __float_as_int(max(val, __int_as_float(assumed))));
396 
397  // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN)
398  } while (assumed != old);
399 
400  return __int_as_float(old);
401 }
__device__ void atomicMax16 ( int16_t *  agg,
const int16_t  val 
)

Definition at line 510 of file cuda_mapd_rt.cu.

Referenced by agg_max_int16_shared().

510  {
511  // properly align the input pointer:
512  unsigned int* base_address_u32 =
513  reinterpret_cast<unsigned int*>(reinterpret_cast<size_t>(agg) & ~0x3);
514 
515  unsigned int old_value = *base_address_u32;
516  unsigned int swap_value, compare_value;
517  do {
518  compare_value = old_value;
519  swap_value =
520  (reinterpret_cast<size_t>(agg) & 0x2)
521  ? static_cast<unsigned int>(max(static_cast<int16_t>(old_value >> 16), val))
522  << 16 |
523  (old_value & 0xFFFF)
524  : (old_value & 0xFFFF0000) |
525  static_cast<unsigned int>(
526  max(static_cast<int16_t>(old_value & 0xFFFF), val));
527  old_value = atomicCAS(base_address_u32, compare_value, swap_value);
528  } while (old_value != compare_value);
529 }


__device__ int64_t atomicMax64 ( int64_t *  address,
int64_t  val 
)

Definition at line 330 of file cuda_mapd_rt.cu.

Referenced by agg_max_shared().

330  {
331  unsigned long long int* address_as_ull = (unsigned long long int*)address;
332  unsigned long long int old = *address_as_ull, assumed;
333 
334  do {
335  assumed = old;
336  old = atomicCAS(address_as_ull, assumed, max((long long)val, (long long)assumed));
337  } while (assumed != old);
338 
339  return old;
340 }


__device__ int64_t atomicMax64SkipVal ( int64_t *  address,
int64_t  val,
const int64_t  skip_val 
)

Definition at line 975 of file cuda_mapd_rt.cu.

Referenced by agg_max_skip_val_shared().

977  {
978  unsigned long long int* address_as_ull =
979  reinterpret_cast<unsigned long long int*>(address);
980  unsigned long long int old = *address_as_ull, assumed;
981 
982  do {
983  assumed = old;
984  old = atomicCAS(address_as_ull,
985  assumed,
986  assumed == skip_val ? val : max((long long)val, (long long)assumed));
987  } while (assumed != old);
988 
989  return old;
990 }


__device__ void atomicMax8 ( int8_t *  agg,
const int8_t  val 
)

Definition at line 532 of file cuda_mapd_rt.cu.

Referenced by agg_max_int8_shared().

532  {
533  // properly align the input pointer:
534  unsigned int* base_address_u32 =
535  reinterpret_cast<unsigned int*>(reinterpret_cast<size_t>(agg) & ~0x3);
536 
537  // __byte_perm(unsigned int A, unsigned int B, unsigned int s):
538  // if s == 0x3214 returns {A[31..24], A[23..16], A[15..8], B[7..0]}
539  // if s == 0x3240 returns {A[31..24], A[23..16], B[7...0], A[7..0]}
540  // if s == 0x3410 returns {A[31..24], B[7....0], A[15..8], A[7..0]}
541  // if s == 0x4210 returns {B[7....0], A[23..16], A[15..8], A[7..0]}
542  constexpr unsigned int byte_permutations[] = {0x3214, 0x3240, 0x3410, 0x4210};
543  unsigned int old_value = *base_address_u32;
544  unsigned int swap_value, compare_value;
545  do {
546  compare_value = old_value;
547  auto max_value = static_cast<unsigned int>(
548  // compare val with its corresponding bits in the compare_value
549  max(val,
550  static_cast<int8_t>(__byte_perm(
551  compare_value, 0, (reinterpret_cast<size_t>(agg) & 0x3) | 0x4440))));
552  swap_value = __byte_perm(
553  compare_value, max_value, byte_permutations[reinterpret_cast<size_t>(agg) & 0x3]);
554  old_value = atomicCAS(base_address_u32, compare_value, swap_value);
555  } while (compare_value != old_value);
556 }


__device__ double atomicMin ( double *  address,
double  val 
)

Definition at line 403 of file cuda_mapd_rt.cu.

Referenced by agg_min_double_shared(), agg_min_float_shared(), agg_min_int32_shared(), atomicMin32SkipVal(), atomicMinFltSkipVal(), and compute_bucket_sizes_impl().

403  {
404  unsigned long long int* address_as_ull = (unsigned long long int*)address;
405  unsigned long long int old = *address_as_ull, assumed;
406 
407  do {
408  assumed = old;
409  old = atomicCAS(address_as_ull,
410  assumed,
411  __double_as_longlong(min(val, __longlong_as_double(assumed))));
412  } while (assumed != old);
413 
414  return __longlong_as_double(old);
415 }


__device__ double atomicMin ( float *  address,
float  val 
)

Definition at line 417 of file cuda_mapd_rt.cu.

417  {
418  int* address_as_ull = (int*)address;
419  int old = *address_as_ull, assumed;
420 
421  do {
422  assumed = old;
423  old = atomicCAS(
424  address_as_ull, assumed, __float_as_int(min(val, __int_as_float(assumed))));
425  } while (assumed != old);
426 
427  return __int_as_float(old);
428 }
__device__ void atomicMin16 ( int16_t *  agg,
const int16_t  val 
)

Definition at line 572 of file cuda_mapd_rt.cu.

Referenced by agg_min_int16_shared().

572  {
573  // properly align the input pointer:
574  unsigned int* base_address_u32 =
575  reinterpret_cast<unsigned int*>(reinterpret_cast<size_t>(agg) & ~0x3);
576 
577  unsigned int old_value = *base_address_u32;
578  unsigned int swap_value, compare_value;
579  do {
580  compare_value = old_value;
581  swap_value =
582  (reinterpret_cast<size_t>(agg) & 0x2)
583  ? static_cast<unsigned int>(min(static_cast<int16_t>(old_value >> 16), val))
584  << 16 |
585  (old_value & 0xFFFF)
586  : (old_value & 0xFFFF0000) |
587  static_cast<unsigned int>(
588  min(static_cast<int16_t>(old_value & 0xFFFF), val));
589  old_value = atomicCAS(base_address_u32, compare_value, swap_value);
590  } while (old_value != compare_value);
591 }


__device__ void atomicMin16SkipVal ( int16_t *  agg,
const int16_t  val,
const int16_t  skip_val 
)

Definition at line 594 of file cuda_mapd_rt.cu.

Referenced by agg_min_int16_skip_val_shared().

596  {
597  // properly align the input pointer:
598  unsigned int* base_address_u32 =
599  reinterpret_cast<unsigned int*>(reinterpret_cast<size_t>(agg) & ~0x3);
600 
601  unsigned int old_value = *base_address_u32;
602  unsigned int swap_value, compare_value;
603  do {
604  compare_value = old_value;
605  int16_t selected_old_val = (reinterpret_cast<size_t>(agg) & 0x2)
606  ? static_cast<int16_t>(old_value >> 16)
607  : static_cast<int16_t>(old_value & 0xFFFF);
608 
609  swap_value =
610  (reinterpret_cast<size_t>(agg) & 0x2)
611  ? static_cast<unsigned int>(
612  selected_old_val == skip_val ? val : min(selected_old_val, val))
613  << 16 |
614  (old_value & 0xFFFF)
615  : (old_value & 0xFFFF0000) |
616  static_cast<unsigned int>(
617  selected_old_val == skip_val ? val : min(selected_old_val, val));
618  old_value = atomicCAS(base_address_u32, compare_value, swap_value);
619  } while (old_value != compare_value);
620 }


__device__ int32_t atomicMin32SkipVal ( int32_t *  address,
int32_t  val,
const int32_t  skip_val 
)

Definition at line 898 of file cuda_mapd_rt.cu.

References atomicMin().

Referenced by agg_min_int32_skip_val_shared().

900  {
901  int32_t old = atomicExch(address, INT_MAX);
902  return atomicMin(address, old == skip_val ? val : min(old, val));
903 }
__device__ double atomicMin(double *address, double val)


__device__ int64_t atomicMin64 ( int64_t *  address,
int64_t  val 
)

Definition at line 342 of file cuda_mapd_rt.cu.

Referenced by agg_min_shared().

342  {
343  unsigned long long int* address_as_ull = (unsigned long long int*)address;
344  unsigned long long int old = *address_as_ull, assumed;
345 
346  do {
347  assumed = old;
348  old = atomicCAS(address_as_ull, assumed, min((long long)val, (long long)assumed));
349  } while (assumed != old);
350 
351  return old;
352 }


__device__ int64_t atomicMin64SkipVal ( int64_t *  address,
int64_t  val,
const int64_t  skip_val 
)

Definition at line 950 of file cuda_mapd_rt.cu.

Referenced by agg_min_skip_val_shared().

952  {
953  unsigned long long int* address_as_ull =
954  reinterpret_cast<unsigned long long int*>(address);
955  unsigned long long int old = *address_as_ull, assumed;
956 
957  do {
958  assumed = old;
959  old = atomicCAS(address_as_ull,
960  assumed,
961  assumed == skip_val ? val : min((long long)val, (long long)assumed));
962  } while (assumed != old);
963 
964  return old;
965 }


__device__ void atomicMin8 ( int8_t *  agg,
const int8_t  val 
)

Definition at line 622 of file cuda_mapd_rt.cu.

Referenced by agg_min_int8_shared().

622  {
623  // properly align the input pointer:
624  unsigned int* base_address_u32 =
625  reinterpret_cast<unsigned int*>(reinterpret_cast<size_t>(agg) & ~0x3);
626 
627  constexpr unsigned int byte_permutations[] = {0x3214, 0x3240, 0x3410, 0x4210};
628  unsigned int old_value = *base_address_u32;
629  unsigned int swap_value, compare_value;
630  do {
631  compare_value = old_value;
632  auto min_value = static_cast<unsigned int>(
633  min(val,
634  static_cast<int8_t>(__byte_perm(
635  compare_value, 0, (reinterpret_cast<size_t>(agg) & 0x3) | 0x4440))));
636  swap_value = __byte_perm(
637  compare_value, min_value, byte_permutations[reinterpret_cast<size_t>(agg) & 0x3]);
638  old_value = atomicCAS(base_address_u32, compare_value, swap_value);
639  } while (compare_value != old_value);
640 }


__device__ void atomicMin8SkipVal ( int8_t *  agg,
const int8_t  val,
const int8_t  skip_val 
)

Definition at line 642 of file cuda_mapd_rt.cu.

Referenced by agg_min_int8_skip_val_shared().

642  {
643  // properly align the input pointer:
644  unsigned int* base_address_u32 =
645  reinterpret_cast<unsigned int*>(reinterpret_cast<size_t>(agg) & ~0x3);
646 
647  constexpr unsigned int byte_permutations[] = {0x3214, 0x3240, 0x3410, 0x4210};
648  unsigned int old_value = *base_address_u32;
649  unsigned int swap_value, compare_value;
650  do {
651  compare_value = old_value;
652  int8_t selected_old_val = static_cast<int8_t>(
653  __byte_perm(compare_value, 0, (reinterpret_cast<size_t>(agg) & 0x3) | 0x4440));
654  auto min_value = static_cast<unsigned int>(
655  selected_old_val == skip_val ? val : min(val, selected_old_val));
656  swap_value = __byte_perm(
657  compare_value, min_value, byte_permutations[reinterpret_cast<size_t>(agg) & 0x3]);
658  old_value = atomicCAS(base_address_u32, compare_value, swap_value);
659  } while (compare_value != old_value);
660 }


__device__ double atomicMinDblSkipVal ( double *  address,
double  val,
const double  skip_val 
)

Definition at line 1081 of file cuda_mapd_rt.cu.

Referenced by agg_min_double_skip_val_shared().

1083  {
1084  unsigned long long int* address_as_ull =
1085  reinterpret_cast<unsigned long long int*>(address);
1086  unsigned long long int old = *address_as_ull;
1087  unsigned long long int skip_val_as_ull =
1088  *reinterpret_cast<const unsigned long long*>(&skip_val);
1089  unsigned long long int assumed;
1090 
1091  do {
1092  assumed = old;
1093  old = atomicCAS(address_as_ull,
1094  assumed,
1095  assumed == skip_val_as_ull
1096  ? *reinterpret_cast<unsigned long long*>(&val)
1097  : __double_as_longlong(min(val, __longlong_as_double(assumed))));
1098  } while (assumed != old);
1099 
1100  return __longlong_as_double(old);
1101 }


__device__ float atomicMinFltSkipVal ( int32_t *  address,
float  val,
const float  skip_val 
)

Definition at line 1033 of file cuda_mapd_rt.cu.

References atomicMin().

Referenced by agg_min_float_skip_val_shared().

1033  {
1034  float old = atomicExch(reinterpret_cast<float*>(address), FLT_MAX);
1035  return atomicMin(
1036  reinterpret_cast<float*>(address),
1037  __float_as_int(old) == __float_as_int(skip_val) ? val : fminf(old, val));
1038 }
__device__ double atomicMin(double *address, double val)


__device__ int32_t atomicSum32SkipVal ( int32_t *  address,
const int32_t  val,
const int32_t  skip_val 
)

Definition at line 913 of file cuda_mapd_rt.cu.

Referenced by agg_sum_int32_skip_val_shared().

915  {
916  unsigned int* address_as_int = (unsigned int*)address;
917  int32_t old = atomicExch(address_as_int, 0);
918  int32_t old2 = atomicAdd(address_as_int, old == skip_val ? val : (val + old));
919  return old == skip_val ? old2 : (old2 + old);
920 }


__device__ int64_t atomicSum64SkipVal ( int64_t *  address,
const int64_t  val,
const int64_t  skip_val 
)

Definition at line 932 of file cuda_mapd_rt.cu.

Referenced by agg_sum_skip_val_shared().

934  {
935  unsigned long long int* address_as_ull = (unsigned long long int*)address;
936  int64_t old = atomicExch(address_as_ull, 0);
937  int64_t old2 = atomicAdd(address_as_ull, old == skip_val ? val : (val + old));
938  return old == skip_val ? old2 : (old2 + old);
939 }


__device__ void atomicSumDblSkipVal ( double *  address,
const double  val,
const double  skip_val 
)

Definition at line 1063 of file cuda_mapd_rt.cu.

Referenced by agg_sum_double_skip_val_shared().

1065  {
1066  unsigned long long int* address_as_ull = (unsigned long long int*)address;
1067  double old = __longlong_as_double(atomicExch(address_as_ull, __double_as_longlong(0.)));
1068  atomicAdd(
1069  address,
1070  __double_as_longlong(old) == __double_as_longlong(skip_val) ? val : (val + old));
1071 }


__device__ void atomicSumFltSkipVal ( float *  address,
const float  val,
const float  skip_val 
)

Definition at line 1048 of file cuda_mapd_rt.cu.


Referenced by agg_sum_float_skip_val_shared().

1050  {
1051  float old = atomicExch(address, 0.f);
1052  atomicAdd(address, __float_as_int(old) == __float_as_int(skip_val) ? val : (val + old));
1053 }


__device__ bool check_interrupt ( )

Definition at line 159 of file cuda_mapd_rt.cu.

Referenced by check_interrupt_rt(), and ColumnFetcher::linearizeFixedLenArrayColFrags().

159  {
160  return (runtime_interrupt_flag == 1) ? true : false;
161 }
__device__ int32_t runtime_interrupt_flag
Definition: cuda_mapd_rt.cu:95
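Host-side sketch (an assumption; the page does not show how the flag is raised): since runtime_interrupt_flag is a __device__ global, a host thread could set it through the CUDA runtime API, and in-flight kernels would then observe the change via check_interrupt().

void request_gpu_interrupt_sketch() {
  const int32_t raised = 1;
  // hypothetical helper: copy the host value into the device-global flag
  cudaMemcpyToSymbol(runtime_interrupt_flag, &raised, sizeof(raised));
}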


__device__ int32_t checked_single_agg_id_double_shared ( int64_t *  agg,
const double  val,
const double  null_val 
)

Definition at line 743 of file cuda_mapd_rt.cu.

745  {
746  unsigned long long int* address_as_ull = reinterpret_cast<unsigned long long int*>(agg);
747  unsigned long long int old = *address_as_ull, assumed;
748 
749  if (val == null_val) {
750  return 0;
751  }
752 
753  do {
754  if (static_cast<int64_t>(old) != __double_as_longlong(null_val)) {
755  if (static_cast<int64_t>(old) != __double_as_longlong(val)) {
756  // see Execute::ERR_SINGLE_VALUE_FOUND_MULTIPLE_VALUES
757  return 15;
758  } else {
759  break;
760  }
761  }
762 
763  assumed = old;
764  old = atomicCAS(address_as_ull, assumed, __double_as_longlong(val));
765  } while (assumed != old);
766 
767  return 0;
768 }
__device__ int32_t checked_single_agg_id_double_shared_slow ( int64_t *  agg,
const double *  valp,
const double  null_val 
)

Definition at line 775 of file cuda_mapd_rt.cu.

777  {
778  unsigned long long int* address_as_ull = reinterpret_cast<unsigned long long int*>(agg);
779  unsigned long long int old = *address_as_ull, assumed;
780  double val = *valp;
781 
782  if (val == null_val) {
783  return 0;
784  }
785 
786  do {
787  if (static_cast<int64_t>(old) != __double_as_longlong(null_val)) {
788  if (static_cast<int64_t>(old) != __double_as_longlong(val)) {
789  // see Execute::ERR_SINGLE_VALUE_FOUND_MULTIPLE_VALUES
790  return 15;
791  } else {
792  break;
793  }
794  }
795 
796  assumed = old;
797  old = atomicCAS(address_as_ull, assumed, __double_as_longlong(val));
798  } while (assumed != old);
799 
800  return 0;
801 }
__device__ int32_t checked_single_agg_id_float_shared ( int32_t *  agg,
const float  val,
const float  null_val 
)

Definition at line 807 of file cuda_mapd_rt.cu.

809  {
810  int* address_as_ull = reinterpret_cast<int*>(agg);
811  int old = *address_as_ull, assumed;
812 
813  if (val == null_val) {
814  return 0;
815  }
816 
817  do {
818  if (old != __float_as_int(null_val)) {
819  if (old != __float_as_int(val)) {
820  // see Execute::ERR_SINGLE_VALUE_FOUND_MULTIPLE_VALUES
821  return 15;
822  } else {
823  break;
824  }
825  }
826 
827  assumed = old;
828  old = atomicCAS(address_as_ull, assumed, __float_as_int(val));
829  } while (assumed != old);
830 
831  return 0;
832 }
__device__ int32_t checked_single_agg_id_shared ( int64_t *  agg,
const int64_t  val,
const int64_t  null_val 
)

Definition at line 700 of file cuda_mapd_rt.cu.

702  {
703  unsigned long long int* address_as_ull = reinterpret_cast<unsigned long long int*>(agg);
704  unsigned long long int old = *address_as_ull, assumed;
705 
706  if (val == null_val) {
707  return 0;
708  }
709 
710  do {
711  if (static_cast<int64_t>(old) != null_val) {
712  if (static_cast<int64_t>(old) != val) {
713  // see Execute::ERR_SINGLE_VALUE_FOUND_MULTIPLE_VALUES
714  return 15;
715  } else {
716  break;
717  }
718  }
719 
720  assumed = old;
721  old = atomicCAS(address_as_ull, assumed, val);
722  } while (assumed != old);
723 
724  return 0;
725 }
__device__ int64_t* declare_dynamic_shared_memory ( )

Definition at line 56 of file cuda_mapd_rt.cu.

56  {
57  extern __shared__ int64_t shared_mem_buffer[];
58  return shared_mem_buffer;
59 }
__device__ bool dynamic_watchdog ( )

Definition at line 115 of file cuda_mapd_rt.cu.

Referenced by anonymous_namespace{ResultSetReduction.cpp}::check_watchdog(), check_watchdog_rt(), anonymous_namespace{ResultSetReduction.cpp}::check_watchdog_with_seed(), get_group_value_columnar_slot_with_watchdog(), get_group_value_columnar_with_watchdog(), and get_group_value_with_watchdog().

115  {
116  // check for dynamic watchdog, if triggered all threads return true
117  if (dw_cycle_budget == 0LL) {
118  return false; // Uninitialized watchdog can't check time
119  }
120  if (dw_abort == 1) {
121  return true; // Received host request to abort
122  }
123  uint32_t smid = get_smid();
124  if (smid >= 128) {
125  return false;
126  }
127  __shared__ volatile int64_t dw_block_cycle_start; // Thread block shared cycle start
128  __shared__ volatile bool
129  dw_should_terminate; // all threads within a block should return together if
130  // watchdog criteria is met
131 
132  // thread 0 either initializes or reads the initial clock cycle; the result is stored
133  // into shared memory. Since all threads within a block share the same SM, there's no
134  // point in using more threads here.
135  if (threadIdx.x == 0) {
136  dw_block_cycle_start = 0LL;
137  int64_t cycle_count = static_cast<int64_t>(clock64());
138  // Make sure the block hasn't switched SMs
139  if (smid == get_smid()) {
140  dw_block_cycle_start = static_cast<int64_t>(
141  atomicCAS(reinterpret_cast<unsigned long long*>(&dw_sm_cycle_start[smid]),
142  0ULL,
143  static_cast<unsigned long long>(cycle_count)));
144  }
145 
146  int64_t cycles = cycle_count - dw_block_cycle_start;
147  if ((smid == get_smid()) && (dw_block_cycle_start > 0LL) &&
148  (cycles > dw_cycle_budget)) {
149  // Check if we're out of time on this particular SM
150  dw_should_terminate = true;
151  } else {
152  dw_should_terminate = false;
153  }
154  }
155  __syncthreads();
156  return dw_should_terminate;
157 }
__device__ int64_t dw_sm_cycle_start[128]
Definition: cuda_mapd_rt.cu:91
__device__ int64_t dw_cycle_budget
Definition: cuda_mapd_rt.cu:93
__inline__ __device__ uint32_t get_smid(void)
Definition: cuda_mapd_rt.cu:97
__device__ int32_t dw_abort
Definition: cuda_mapd_rt.cu:94
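Usage sketch (an assumption, not taken from the page): a kernel's main loop can poll dynamic_watchdog() and bail out once the host-set cycle budget (dw_cycle_budget) is exhausted or dw_abort is raised; every thread in a block sees the same verdict because it is published through shared memory.

__global__ void watched_kernel_sketch(int64_t* out, const int64_t n) {
  for (int64_t i = blockIdx.x * blockDim.x + threadIdx.x; i < n;
       i += static_cast<int64_t>(gridDim.x) * blockDim.x) {
    if (dynamic_watchdog()) {
      return;  // budget exceeded or abort requested: give up cooperatively
    }
    out[i] = i;  // placeholder for real per-row work
  }
}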


__device__ void force_sync ( )

Definition at line 1297 of file cuda_mapd_rt.cu.

1297  {
1298  __threadfence_block();
1299 }
__device__ int64_t get_block_index ( )

Definition at line 23 of file cuda_mapd_rt.cu.

23  {
24  return blockIdx.x;
25 }
template<typename T = unsigned long long>
__device__ T get_empty_key ( )
inline

Definition at line 164 of file cuda_mapd_rt.cu.

References EMPTY_KEY_64.

164  {
165  return EMPTY_KEY_64;
166 }
#define EMPTY_KEY_64
template<>
__device__ unsigned int get_empty_key ( )
inline

Definition at line 169 of file cuda_mapd_rt.cu.

References EMPTY_KEY_32.

169  {
170  return EMPTY_KEY_32;
171 }
#define EMPTY_KEY_32
template<typename T >
__device__ int64_t* get_matching_group_value ( int64_t *  groups_buffer,
const uint32_t  h,
const T *  key,
const uint32_t  key_count,
const uint32_t  row_size_quad 
)
inline

Definition at line 174 of file cuda_mapd_rt.cu.

References align_to_int64().

Referenced by get_group_value(), get_group_value_with_watchdog(), and get_matching_group_value().

178  {
179  const T empty_key = get_empty_key<T>();
180  uint32_t off = h * row_size_quad;
181  auto row_ptr = reinterpret_cast<T*>(groups_buffer + off);
182  {
183  const T old = atomicCAS(row_ptr, empty_key, *key);
184  if (empty_key == old && key_count > 1) {
185  for (size_t i = 1; i <= key_count - 1; ++i) {
186  atomicExch(row_ptr + i, key[i]);
187  }
188  }
189  }
190  if (key_count > 1) {
191  while (atomicAdd(row_ptr + key_count - 1, 0) == empty_key) {
192  // spin until the winning thread has finished writing the entire key and the init
193  // value
194  }
195  }
196  bool match = true;
197  for (uint32_t i = 0; i < key_count; ++i) {
198  if (row_ptr[i] != key[i]) {
199  match = false;
200  break;
201  }
202  }
203 
204  if (match) {
205  auto row_ptr_i8 = reinterpret_cast<int8_t*>(row_ptr + key_count);
206  return reinterpret_cast<int64_t*>(align_to_int64(row_ptr_i8));
207  }
208  return NULL;
209 }
FORCE_INLINE HOST DEVICE T align_to_int64(T addr)
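Illustrative probe loop (an assumption modeled on the get_group_value() callers listed above, not copied from them): linear probing over the buckets until get_matching_group_value() returns a non-NULL payload pointer for this key.

__device__ int64_t* probe_group_slot_sketch(int64_t* groups_buffer,
                                            const uint32_t start_bucket,
                                            const uint32_t entry_count,
                                            const int64_t* key,
                                            const uint32_t key_count,
                                            const uint32_t row_size_quad) {
  uint32_t h = start_bucket;
  do {
    // non-NULL when the bucket was empty (and is now claimed) or already holds this key
    int64_t* slots = get_matching_group_value(
        groups_buffer, h, key, key_count, static_cast<uint32_t>(sizeof(int64_t)), row_size_quad);
    if (slots) {
      return slots;
    }
    h = (h + 1) % entry_count;  // collision with a different key: try the next bucket
  } while (h != start_bucket);
  return NULL;  // buffer exhausted
}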


__device__ int64_t* get_matching_group_value ( int64_t *  groups_buffer,
const uint32_t  h,
const int64_t *  key,
const uint32_t  key_count,
const uint32_t  key_width,
const uint32_t  row_size_quad 
)

Definition at line 211 of file cuda_mapd_rt.cu.

References get_matching_group_value().

216  {
217  switch (key_width) {
218  case 4:
219  return get_matching_group_value(groups_buffer,
220  h,
221  reinterpret_cast<const unsigned int*>(key),
222  key_count,
223  row_size_quad);
224  case 8:
225  return get_matching_group_value(groups_buffer,
226  h,
227  reinterpret_cast<const unsigned long long*>(key),
228  key_count,
229  row_size_quad);
230  default:
231  return NULL;
232  }
233 }
__device__ int64_t * get_matching_group_value(int64_t *groups_buffer, const uint32_t h, const T *key, const uint32_t key_count, const uint32_t row_size_quad)


__device__ int64_t* get_matching_group_value_columnar ( int64_t *  groups_buffer,
const uint32_t  h,
const int64_t *  key,
const uint32_t  key_qw_count,
const size_t  entry_count 
)

Definition at line 296 of file cuda_mapd_rt.cu.

References EMPTY_KEY_64.

Referenced by get_group_value_columnar(), and get_group_value_columnar_with_watchdog().

301  {
302  uint32_t off = h;
303  {
304  const uint64_t old = atomicCAS(
305  reinterpret_cast<unsigned long long*>(groups_buffer + off), EMPTY_KEY_64, *key);
306  if (EMPTY_KEY_64 == old) {
307  for (size_t i = 0; i < key_qw_count; ++i) {
308  groups_buffer[off] = key[i];
309  off += entry_count;
310  }
311  return &groups_buffer[off];
312  }
313  }
314  __syncthreads();
315  off = h;
316  for (size_t i = 0; i < key_qw_count; ++i) {
317  if (groups_buffer[off] != key[i]) {
318  return NULL;
319  }
320  off += entry_count;
321  }
322  return &groups_buffer[off];
323 }
#define EMPTY_KEY_64


template<typename T >
__device__ int32_t get_matching_group_value_columnar_slot ( int64_t *  groups_buffer,
const uint32_t  entry_count,
const uint32_t  h,
const T *  key,
const uint32_t  key_count 
)

Definition at line 236 of file cuda_mapd_rt.cu.


Referenced by get_group_value_columnar_slot(), get_group_value_columnar_slot_with_watchdog(), and get_matching_group_value_columnar_slot().

240  {
241  const T empty_key = get_empty_key<T>();
242  const uint64_t old =
243  atomicCAS(reinterpret_cast<T*>(groups_buffer + h), empty_key, *key);
244  // the winner thread proceeds with writing the rest of the keys
245  if (old == empty_key) {
246  uint32_t offset = h + entry_count;
247  for (size_t i = 1; i < key_count; ++i) {
248  *reinterpret_cast<T*>(groups_buffer + offset) = key[i];
249  offset += entry_count;
250  }
251  }
252 
253  __threadfence();
254  // for all threads except the winning thread, the memory contents of the keys
255  // related to the hash offset are checked again. In case of a complete match,
256  // the hash offset is returned; otherwise -1 is returned
257  if (old != empty_key) {
258  uint32_t offset = h;
259  for (uint32_t i = 0; i < key_count; ++i) {
260  if (*reinterpret_cast<T*>(groups_buffer + offset) != key[i]) {
261  return -1;
262  }
263  offset += entry_count;
264  }
265  }
266  return h;
267 }

+ Here is the caller graph for this function:
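
The returned slot index (rather than a pointer) lets callers such as get_group_value_columnar_slot() address their own columnar output buffers. A minimal, hypothetical sketch of that pattern (agg_col is a placeholder for one aggregate column of entry_count slots):

__device__ void bump_count_column(int64_t* agg_col,
                                  const int32_t slot,
                                  const uint32_t entry_count) {
  if (slot >= 0 && static_cast<uint32_t>(slot) < entry_count) {
    // column-major: one contiguous column, indexed directly by the slot
    atomicAdd(reinterpret_cast<unsigned long long*>(&agg_col[slot]), 1ULL);
  }
}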

__device__ int32_t get_matching_group_value_columnar_slot ( int64_t *  groups_buffer,
const uint32_t  entry_count,
const uint32_t  h,
const int64_t *  key,
const uint32_t  key_count,
const uint32_t  key_width 
)

Definition at line 270 of file cuda_mapd_rt.cu.

References get_matching_group_value_columnar_slot().

275  {
276  switch (key_width) {
277  case 4:
278  return get_matching_group_value_columnar_slot(
279  groups_buffer,
280  entry_count,
281  h,
282  reinterpret_cast<const unsigned int*>(key),
283  key_count);
284  case 8:
285  return get_matching_group_value_columnar_slot(
286  groups_buffer,
287  entry_count,
288  h,
289  reinterpret_cast<const unsigned long long*>(key),
290  key_count);
291  default:
292  return -1;
293  }
294 }

+ Here is the call graph for this function:

__inline__ __device__ uint32_t get_smid ( void  )

Definition at line 97 of file cuda_mapd_rt.cu.

Referenced by dynamic_watchdog().

97  {
98  uint32_t ret;
99  asm("mov.u32 %0, %%smid;" : "=r"(ret));
100  return ret;
101 }

+ Here is the caller graph for this function:
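
The SM id read here is what lets the runtime keep per-SM state; dw_sm_cycle_start below is indexed this way by dynamic_watchdog(). A minimal sketch of the pattern, using a hypothetical per_sm_counter array:

__device__ unsigned long long per_sm_counter[128];

__device__ void bump_per_sm_counter() {
  const uint32_t sm = get_smid();  // hardware id of the calling thread's SM
  if (sm < 128) {
    atomicAdd(&per_sm_counter[sm], 1ULL);  // one slot per streaming multiprocessor
  }
}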

__device__ int64_t get_thread_index ( )

Definition at line 19 of file cuda_mapd_rt.cu.

19  {
20  return threadIdx.x;
21 }
__device__ int32_t group_buff_idx_impl ( )

Definition at line 31 of file cuda_mapd_rt.cu.

References pos_start_impl().

31  {
32  return pos_start_impl(NULL);
33 }

+ Here is the call graph for this function:

__device__ const int64_t* init_shared_mem ( const int64_t *  global_groups_buffer,
const int32_t  groups_buffer_size 
)

Initializes the shared memory buffer for perfect hash group by. In this function, we simply copy the global group-by buffer (already initialized on the host and transferred to the device) into each thread block's shared-memory group-by buffer.

Definition at line 66 of file cuda_mapd_rt.cu.

67  {
68  // dynamic shared memory declaration
69  extern __shared__ int64_t shared_groups_buffer[];
70 
71  // it is assumed that buffer size is aligned with 64-bit units
72  // so it is safe to assign 64-bit to each thread
73  const int32_t buffer_units = groups_buffer_size >> 3;
74 
75  for (int32_t pos = threadIdx.x; pos < buffer_units; pos += blockDim.x) {
76  shared_groups_buffer[pos] = global_groups_buffer[pos];
77  }
78  __syncthreads();
79  return shared_groups_buffer;
80 }
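
Because the buffer is declared as dynamic shared memory, the launch must pass its size as the third launch-configuration parameter. The sketch below only illustrates that contract; query_kernel, launch_query, and the grid/block dimensions are placeholders, and groups_buffer_size is treated as a byte count aligned to 64-bit units, as the copy loop above assumes.

__global__ void query_kernel(const int64_t* global_groups_buffer,
                             const int32_t groups_buffer_size) {
  const int64_t* group_buf = init_shared_mem(global_groups_buffer, groups_buffer_size);
  // ... aggregate into group_buf, then reduce back to global memory ...
}

void launch_query(const int64_t* dev_groups_buffer, const int32_t groups_buffer_size) {
  const dim3 grid_dim(64);
  const dim3 block_dim(256);
  query_kernel<<<grid_dim, block_dim, groups_buffer_size>>>(dev_groups_buffer,
                                                            groups_buffer_size);
}
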
__device__ const int64_t* init_shared_mem_nop ( const int64_t *  groups_buffer,
const int32_t  groups_buffer_size 
)

Definition at line 43 of file cuda_mapd_rt.cu.

45  {
46  return groups_buffer;
47 }
__device__ void linear_probabilistic_count ( uint8_t *  bitmap,
const uint32_t  bitmap_bytes,
const uint8_t *  key_bytes,
const uint32_t  key_len 
)

Definition at line 1226 of file cuda_mapd_rt.cu.

1229  {
1230  const uint32_t bit_pos = MurmurHash3(key_bytes, key_len, 0) % (bitmap_bytes * 8);
1231  const uint32_t word_idx = bit_pos / 32;
1232  const uint32_t bit_idx = bit_pos % 32;
1233  atomicOr(((uint32_t*)bitmap) + word_idx, 1 << bit_idx);
1234 }
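
The bitmap filled here supports a linear-counting estimate of the number of distinct keys on the host. The snippet below is a generic host-side sketch of that estimator (the standard -m * ln(z/m) formula with m bits and z unset bits), not code from this repository.

#include <cmath>
#include <cstdint>
#include <vector>

double estimate_distinct(const std::vector<uint8_t>& bitmap) {
  const double m = bitmap.size() * 8.0;  // total number of bits
  double zeros = 0;
  for (const uint8_t byte : bitmap) {
    zeros += 8 - __builtin_popcount(byte);  // count unset bits
  }
  if (zeros == 0) {
    return m;  // bitmap saturated; the estimate is no longer reliable
  }
  return -m * std::log(zeros / m);  // linear counting estimator
}
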
__device__ int32_t pos_start_impl ( const int32_t *  row_index_resume)

Definition at line 27 of file cuda_mapd_rt.cu.

Referenced by get_bin_from_k_heap_impl(), get_error_code(), group_buff_idx_impl(), and record_error_code().

27  {
28  return blockIdx.x * blockDim.x + threadIdx.x;
29 }

+ Here is the caller graph for this function:

__device__ int32_t pos_step_impl ( )

Definition at line 35 of file cuda_mapd_rt.cu.

Referenced by get_bin_from_k_heap_impl().

35  {
36  return blockDim.x * gridDim.x;
37 }

+ Here is the caller graph for this function:
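
Together, pos_start_impl() and pos_step_impl() implement the usual grid-stride pattern: each thread starts at its global index and advances by the total number of threads in the grid. A minimal sketch (scan_rows and the commented per-row callback are placeholders):

__device__ void scan_rows(const int64_t row_count) {
  const int32_t start = pos_start_impl(NULL);  // blockIdx.x * blockDim.x + threadIdx.x
  const int32_t step = pos_step_impl();        // blockDim.x * gridDim.x
  for (int64_t pos = start; pos < row_count; pos += step) {
    // process_row(pos);  // per-row work goes here
  }
}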

__device__ bool slotEmptyKeyCAS ( int64_t *  slot,
int64_t  new_val,
int64_t  init_val 
)

Definition at line 1126 of file cuda_mapd_rt.cu.

1128  {
1129  auto slot_address = reinterpret_cast<unsigned long long int*>(slot);
1130  const auto empty_key =
1131  static_cast<unsigned long long int*>(static_cast<void*>(&init_val));
1132  const auto new_val_cast =
1133  static_cast<unsigned long long int*>(static_cast<void*>(&new_val));
1134 
1135  const auto old_val = atomicCAS(slot_address, *empty_key, *new_val_cast);
1136  if (old_val == *empty_key) {
1137  return true;
1138  } else {
1139  return false;
1140  }
1141 }
__device__ bool slotEmptyKeyCAS_int16 ( int16_t *  slot,
int16_t  new_val,
int16_t  init_val 
)

Definition at line 1154 of file cuda_mapd_rt.cu.

1156  {
1157  unsigned int* base_slot_address =
1158  reinterpret_cast<unsigned int*>(reinterpret_cast<size_t>(slot) & ~0x3);
1159  unsigned int old_value = *base_slot_address;
1160  unsigned int swap_value, compare_value;
1161  do {
1162  compare_value = old_value;
1163  // exit criteria: if init_val does not exist in the slot (some other thread has
1164  // succeeded)
1165  if (static_cast<unsigned int>(init_val) !=
1166  __byte_perm(
1167  compare_value, 0, (reinterpret_cast<size_t>(slot) & 0x2 ? 0x3244 : 0x4410))) {
1168  return false;
1169  }
1170  swap_value = __byte_perm(compare_value,
1171  static_cast<unsigned int>(new_val),
1172  (reinterpret_cast<size_t>(slot) & 0x2) ? 0x5410 : 0x3254);
1173  old_value = atomicCAS(base_slot_address, compare_value, swap_value);
1174  } while (compare_value != old_value);
1175  return true;
1176 }
__device__ bool slotEmptyKeyCAS_int32 ( int32_t *  slot,
int32_t  new_val,
int32_t  init_val 
)

Definition at line 1143 of file cuda_mapd_rt.cu.

1145  {
1146  unsigned int* slot_address = reinterpret_cast<unsigned int*>(slot);
1147  unsigned int compare_value = static_cast<unsigned int>(init_val);
1148  unsigned int swap_value = static_cast<unsigned int>(new_val);
1149 
1150  const unsigned int old_value = atomicCAS(slot_address, compare_value, swap_value);
1151  return old_value == compare_value;
1152 }
__device__ bool slotEmptyKeyCAS_int8 ( int8_t *  slot,
int8_t  new_val,
int8_t  init_val 
)

Definition at line 1178 of file cuda_mapd_rt.cu.

1180  {
1181  // properly align the slot address:
1182  unsigned int* base_slot_address =
1183  reinterpret_cast<unsigned int*>(reinterpret_cast<size_t>(slot) & ~0x3);
1184  constexpr unsigned int byte_permutations[] = {0x3214, 0x3240, 0x3410, 0x4210};
1185  unsigned int old_value = *base_slot_address;
1186  unsigned int swap_value, compare_value;
1187  do {
1188  compare_value = old_value;
1189  // exit criteria: if init_val does not exist in the slot (some other thread has
1190  // succeeded)
1191  if (static_cast<unsigned int>(init_val) !=
1192  __byte_perm(compare_value, 0, (reinterpret_cast<size_t>(slot) & 0x3) | 0x4440)) {
1193  return false;
1194  }
1195  swap_value = __byte_perm(compare_value,
1196  static_cast<unsigned int>(new_val),
1197  byte_permutations[reinterpret_cast<size_t>(slot) & 0x3]);
1198  old_value = atomicCAS(base_slot_address, compare_value, swap_value);
1199  } while (compare_value != old_value);
1200  return true;
1201 }
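
All three variants share the same contract: the call returns true only for the thread that swaps the empty sentinel out of the slot, so any per-slot initialization can safely follow a successful call. A hypothetical usage sketch (the sentinel value here is illustrative only):

__device__ void claim_int8_slot(int8_t* slot, const int8_t new_val) {
  const int8_t empty_sentinel = -128;  // hypothetical empty key for this example
  if (slotEmptyKeyCAS_int8(slot, new_val, empty_sentinel)) {
    // this thread won the slot; initialize any dependent per-slot state here
  } else {
    // another thread claimed the slot first, or it already held a live value
  }
}
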
__device__ uint64_t string_decode ( int8_t *  chunk_iter_,
int64_t  pos 
)

Definition at line 1215 of file cuda_mapd_rt.cu.

References ChunkIter_get_nth(), VarlenDatum::is_null, VarlenDatum::length, and VarlenDatum::pointer.

1215  {
1216  // TODO(alex): de-dup, the x64 version is basically identical
1217  ChunkIter* chunk_iter = reinterpret_cast<ChunkIter*>(chunk_iter_);
1218  VarlenDatum vd;
1219  bool is_end;
1220  ChunkIter_get_nth(chunk_iter, pos, false, &vd, &is_end);
1221  return vd.is_null ? 0
1222  : (reinterpret_cast<uint64_t>(vd.pointer) & 0xffffffffffff) |
1223  (static_cast<uint64_t>(vd.length) << 48);
1224 }

+ Here is the call graph for this function:
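
The packed value places the payload pointer in the low 48 bits and the length in the high 16 bits, mirroring the packing in the listing above. A small hypothetical helper to unpack it:

__device__ void unpack_string(const uint64_t packed, const char** ptr, uint32_t* len) {
  *ptr = reinterpret_cast<const char*>(packed & 0xffffffffffffULL);  // low 48 bits
  *len = static_cast<uint32_t>(packed >> 48);                        // high 16 bits
}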

__device__ void sync_threadblock ( )

Definition at line 1320 of file cuda_mapd_rt.cu.

Referenced by GpuSharedMemCodeBuilder::codegenInitialization(), and GpuSharedMemCodeBuilder::codegenReduction().

1320  {
1321  __syncthreads();
1322 }

+ Here is the caller graph for this function:

__device__ void sync_warp ( )

Definition at line 1301 of file cuda_mapd_rt.cu.

1301  {
1302  __syncwarp();
1303 }
__device__ void sync_warp_protected ( int64_t  thread_pos,
int64_t  row_count 
)

Protected warp synchronization to make sure all (or none) of the threads within a warp go through a synchronization barrier. thread_pos: the current thread position to be used for a memory access. row_count: maximum number of rows to be processed. The function performs a warp sync iff all 32 threads within that warp will process valid data. NOTE: it currently assumes that the warp size is 32.

Definition at line 1312 of file cuda_mapd_rt.cu.

1312  {
1313  // only syncing if NOT within the same warp as those threads experiencing the critical
1314  // edge
1315  if ((((row_count - 1) | 0x1F) - thread_pos) >= 32) {
1316  __syncwarp();
1317  }
1318 }
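
A worked example of the guard, written as a standalone predicate for illustration (warp_will_sync is not part of this file): with row_count = 70, the last valid row index 69 falls in the warp covering positions 64..95, so (69 | 0x1F) == 95; threads 0..63 satisfy the condition and their warps sync, while the warp covering positions 64..95 skips the barrier because some of its lanes have no valid row.

__device__ bool warp_will_sync(const int64_t thread_pos, const int64_t row_count) {
  // (row_count - 1) | 0x1F is the last lane position of the warp that holds the
  // final valid row; a thread at least one full warp below it can safely sync
  return (((row_count - 1) | 0x1F) - thread_pos) >= 32;
}
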
__device__ int8_t thread_warp_idx ( const int8_t  warp_sz)

Definition at line 39 of file cuda_mapd_rt.cu.

39  {
40  return threadIdx.x % warp_sz;
41 }
__device__ void write_back_non_grouped_agg ( int64_t *  input_buffer,
int64_t *  output_buffer,
const int32_t  agg_idx 
)

Definition at line 1332 of file cuda_mapd_rt.cu.

References agg_sum_shared().

1334  {
1335  if (threadIdx.x == agg_idx) {
1336  agg_sum_shared(output_buffer, input_buffer[agg_idx]);
1337  }
1338 }

+ Here is the call graph for this function:

__device__ void write_back_nop ( int64_t *  dest,
int64_t *  src,
const int32_t  sz 
)

Definition at line 49 of file cuda_mapd_rt.cu.

49  {
50 }

Variable Documentation

__device__ int32_t dw_abort = 0

Definition at line 94 of file cuda_mapd_rt.cu.

Referenced by dynamic_watchdog(), dynamic_watchdog_init(), and Executor::interrupt().

__device__ int64_t dw_cycle_budget = 0

Definition at line 93 of file cuda_mapd_rt.cu.

Referenced by dynamic_watchdog(), and dynamic_watchdog_init().

__device__ int64_t dw_sm_cycle_start[128]

Definition at line 91 of file cuda_mapd_rt.cu.

Referenced by dynamic_watchdog().

__device__ int32_t runtime_interrupt_flag = 0

Definition at line 95 of file cuda_mapd_rt.cu.

Referenced by check_interrupt(), check_interrupt_init(), and Executor::interrupt().
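
For context, a kernel observes this flag through check_interrupt() and bails out cooperatively once it is set. OmniSciDB itself raises the flag from Executor::interrupt() through its GPU module loader; the host-side sketch below is only an illustration and assumes a build where the symbol is visible to the CUDA runtime API in the same compilation unit.

void request_interrupt() {
  const int32_t set = 1;
  // copy the new value into the device-side flag; subsequent calls to
  // check_interrupt() on the GPU then return true
  cudaMemcpyToSymbol(runtime_interrupt_flag, &set, sizeof(set));
}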