OmniSciDB  72c90bc290
cuda_mapd_rt.cu File Reference
#include <cuda.h>
#include <float.h>
#include <stdint.h>
#include <stdio.h>
#include <limits>
#include "BufferCompaction.h"
#include "ExtensionFunctions.hpp"
#include "GpuRtConstants.h"
#include "HyperLogLogRank.h"
#include "GpuInitGroups.cu"
#include "GroupByRuntime.cpp"
#include "JoinHashTable/Runtime/JoinHashTableQueryRuntime.cpp"
#include "MurmurHash.cpp"
#include "TopKRuntime.cpp"
#include "../Utils/ChunkIter.cpp"
#include "DateTruncate.cpp"
#include "ExtractFromTime.cpp"
#include "ArrayOps.cpp"
#include "DateAdd.cpp"
#include "GeoOps.cpp"
#include "StringFunctions.cpp"
#include "../Utils/Regexp.cpp"
#include "../Utils/StringLike.cpp"
+ Include dependency graph for cuda_mapd_rt.cu:

Go to the source code of this file.

Macros

#define init_group_by_buffer_gpu_impl   init_group_by_buffer_gpu
 
#define DEF_AGG_ID_INT_SHARED(n)
 
#define DEF_SKIP_AGG(base_agg_func)
 
#define DATA_T   int64_t
 
#define ADDR_T   uint64_t
 
#define DATA_T   int32_t
 
#define ADDR_T   uint32_t
 
#define DEF_SKIP_AGG(base_agg_func)
 
#define DATA_T   double
 
#define ADDR_T   uint64_t
 
#define DATA_T   float
 
#define ADDR_T   uint32_t
 
#define EXECUTE_INCLUDE
 

Functions

__device__ int64_t get_thread_index ()
 
__device__ int64_t get_block_index ()
 
__device__ int32_t pos_start_impl (const int32_t *row_index_resume)
 
__device__ int32_t group_buff_idx_impl ()
 
__device__ int32_t pos_step_impl ()
 
__device__ int8_t thread_warp_idx (const int8_t warp_sz)
 
__device__ const int64_t * init_shared_mem_nop (const int64_t *groups_buffer, const int32_t groups_buffer_size)
 
__device__ void write_back_nop (int64_t *dest, int64_t *src, const int32_t sz)
 
__device__ int64_t * declare_dynamic_shared_memory ()
 
__device__ const int64_t * init_shared_mem (const int64_t *global_groups_buffer, const int32_t groups_buffer_size)
 
__inline__ __device__ uint32_t get_smid (void)
 
__device__ bool dynamic_watchdog ()
 
__device__ bool check_interrupt ()
 
template<typename T = unsigned long long>
__device__ T get_empty_key ()
 
template<>
__device__ unsigned int get_empty_key ()
 
template<typename T >
__device__ int64_t * get_matching_group_value (int64_t *groups_buffer, const uint32_t h, const T *key, const uint32_t key_count, const uint32_t row_size_quad)
 
__device__ int64_t * get_matching_group_value (int64_t *groups_buffer, const uint32_t h, const int64_t *key, const uint32_t key_count, const uint32_t key_width, const uint32_t row_size_quad)
 
template<typename T >
__device__ int32_t get_matching_group_value_columnar_slot (int64_t *groups_buffer, const uint32_t entry_count, const uint32_t h, const T *key, const uint32_t key_count)
 
__device__ int32_t get_matching_group_value_columnar_slot (int64_t *groups_buffer, const uint32_t entry_count, const uint32_t h, const int64_t *key, const uint32_t key_count, const uint32_t key_width)
 
__device__ int64_t * get_matching_group_value_columnar (int64_t *groups_buffer, const uint32_t h, const int64_t *key, const uint32_t key_qw_count, const size_t entry_count)
 
__device__ int64_t atomicMax64 (int64_t *address, int64_t val)
 
__device__ int64_t atomicMin64 (int64_t *address, int64_t val)
 
__device__ double atomicMax (double *address, double val)
 
__device__ float atomicMax (float *address, float val)
 
__device__ double atomicMin (double *address, double val)
 
__device__ double atomicMin (float *address, float val)
 
__device__ uint64_t agg_count_shared (uint64_t *agg, const int64_t val)
 
__device__ uint64_t agg_count_if_shared (uint64_t *agg, const int64_t cond)
 
__device__ uint32_t agg_count_int32_shared (uint32_t *agg, const int32_t val)
 
__device__ uint32_t agg_count_if_int32_shared (uint32_t *agg, const int32_t cond)
 
__device__ uint64_t agg_count_double_shared (uint64_t *agg, const double val)
 
__device__ uint32_t agg_count_float_shared (uint32_t *agg, const float val)
 
__device__ int64_t agg_sum_shared (int64_t *agg, const int64_t val)
 
__device__ int32_t agg_sum_int32_shared (int32_t *agg, const int32_t val)
 
__device__ void agg_sum_float_shared (int32_t *agg, const float val)
 
__device__ void agg_sum_double_shared (int64_t *agg, const double val)
 
__device__ int64_t agg_sum_if_shared (int64_t *agg, const int64_t val, const int8_t cond)
 
__device__ int32_t agg_sum_if_int32_shared (int32_t *agg, const int32_t val, const int8_t cond)
 
__device__ void agg_sum_if_float_shared (int32_t *agg, const float val, const int8_t cond)
 
__device__ void agg_sum_if_double_shared (int64_t *agg, const double val, const int8_t cond)
 
__device__ void agg_max_shared (int64_t *agg, const int64_t val)
 
__device__ void agg_max_int32_shared (int32_t *agg, const int32_t val)
 
__device__ void agg_max_double_shared (int64_t *agg, const double val)
 
__device__ void agg_max_float_shared (int32_t *agg, const float val)
 
__device__ void agg_min_shared (int64_t *agg, const int64_t val)
 
__device__ void agg_min_int32_shared (int32_t *agg, const int32_t val)
 
__device__ void atomicMax16 (int16_t *agg, const int16_t val)
 
__device__ void atomicMax8 (int8_t *agg, const int8_t val)
 
__device__ void atomicMin16 (int16_t *agg, const int16_t val)
 
__device__ void atomicMin16SkipVal (int16_t *agg, const int16_t val, const int16_t skip_val)
 
__device__ void atomicMin8 (int8_t *agg, const int8_t val)
 
__device__ void atomicMin8SkipVal (int8_t *agg, const int8_t val, const int8_t skip_val)
 
__device__ void agg_max_int16_shared (int16_t *agg, const int16_t val)
 
__device__ void agg_max_int8_shared (int8_t *agg, const int8_t val)
 
__device__ void agg_min_int16_shared (int16_t *agg, const int16_t val)
 
__device__ void agg_min_int8_shared (int8_t *agg, const int8_t val)
 
__device__ void agg_min_double_shared (int64_t *agg, const double val)
 
__device__ void agg_min_float_shared (int32_t *agg, const float val)
 
__device__ void agg_id_shared (int64_t *agg, const int64_t val)
 
__device__ int8_t * agg_id_varlen_shared (int8_t *varlen_buffer, const int64_t offset, const int8_t *value, const int64_t size_bytes)
 
__device__ int32_t checked_single_agg_id_shared (int64_t *agg, const int64_t val, const int64_t null_val)
 
__device__ void agg_id_double_shared (int64_t *agg, const double val)
 
__device__ int32_t checked_single_agg_id_double_shared (int64_t *agg, const double val, const double null_val)
 
__device__ void agg_id_double_shared_slow (int64_t *agg, const double *val)
 
__device__ int32_t checked_single_agg_id_double_shared_slow (int64_t *agg, const double *valp, const double null_val)
 
__device__ void agg_id_float_shared (int32_t *agg, const float val)
 
__device__ int32_t checked_single_agg_id_float_shared (int32_t *agg, const float val, const float null_val)
 
__device__ void agg_max_int32_skip_val_shared (int32_t *agg, const int32_t val, const int32_t skip_val)
 
__device__ void agg_max_int16_skip_val_shared (int16_t *agg, const int16_t val, const int16_t skip_val)
 
__device__ void agg_min_int16_skip_val_shared (int16_t *agg, const int16_t val, const int16_t skip_val)
 
__device__ void agg_max_int8_skip_val_shared (int8_t *agg, const int8_t val, const int8_t skip_val)
 
__device__ void agg_min_int8_skip_val_shared (int8_t *agg, const int8_t val, const int8_t skip_val)
 
__device__ int32_t atomicMin32SkipVal (int32_t *address, int32_t val, const int32_t skip_val)
 
__device__ void agg_min_int32_skip_val_shared (int32_t *agg, const int32_t val, const int32_t skip_val)
 
__device__ int32_t atomicSum32SkipVal (int32_t *address, const int32_t val, const int32_t skip_val)
 
__device__ int32_t agg_sum_int32_skip_val_shared (int32_t *agg, const int32_t val, const int32_t skip_val)
 
__device__ int32_t agg_sum_if_int32_skip_val_shared (int32_t *agg, const int32_t val, const int32_t skip_val, const int8_t cond)
 
__device__ int64_t atomicSum64SkipVal (int64_t *address, const int64_t val, const int64_t skip_val)
 
__device__ int64_t agg_sum_skip_val_shared (int64_t *agg, const int64_t val, const int64_t skip_val)
 
__device__ int64_t agg_sum_if_skip_val_shared (int64_t *agg, const int64_t val, const int64_t skip_val, const int8_t cond)
 
__device__ int64_t atomicMin64SkipVal (int64_t *address, int64_t val, const int64_t skip_val)
 
__device__ void agg_min_skip_val_shared (int64_t *agg, const int64_t val, const int64_t skip_val)
 
__device__ int64_t atomicMax64SkipVal (int64_t *address, int64_t val, const int64_t skip_val)
 
__device__ void agg_max_skip_val_shared (int64_t *agg, const int64_t val, const int64_t skip_val)
 
__device__ void agg_max_float_skip_val_shared (int32_t *agg, const float val, const float skip_val)
 
__device__ float atomicMinFltSkipVal (int32_t *address, float val, const float skip_val)
 
__device__ void agg_min_float_skip_val_shared (int32_t *agg, const float val, const float skip_val)
 
__device__ void atomicSumFltSkipVal (float *address, const float val, const float skip_val)
 
__device__ void agg_sum_float_skip_val_shared (int32_t *agg, const float val, const float skip_val)
 
__device__ void agg_sum_if_float_skip_val_shared (int32_t *agg, const float val, const float skip_val, const int8_t cond)
 
__device__ void atomicSumDblSkipVal (double *address, const double val, const double skip_val)
 
__device__ void agg_sum_double_skip_val_shared (int64_t *agg, const double val, const double skip_val)
 
__device__ void agg_sum_if_double_skip_val_shared (int64_t *agg, const double val, const double skip_val, const int8_t cond)
 
__device__ double atomicMinDblSkipVal (double *address, double val, const double skip_val)
 
__device__ void agg_min_double_skip_val_shared (int64_t *agg, const double val, const double skip_val)
 
__device__ void agg_max_double_skip_val_shared (int64_t *agg, const double val, const double skip_val)
 
__device__ bool slotEmptyKeyCAS (int64_t *slot, int64_t new_val, int64_t init_val)
 
__device__ bool slotEmptyKeyCAS_int32 (int32_t *slot, int32_t new_val, int32_t init_val)
 
__device__ bool slotEmptyKeyCAS_int16 (int16_t *slot, int16_t new_val, int16_t init_val)
 
__device__ bool slotEmptyKeyCAS_int8 (int8_t *slot, int8_t new_val, int8_t init_val)
 
__device__ StringView string_decode (int8_t *chunk_iter_, int64_t pos)
 
__device__ void linear_probabilistic_count (uint8_t *bitmap, const uint32_t bitmap_bytes, const uint8_t *key_bytes, const uint32_t key_len)
 
__device__ void agg_count_distinct_bitmap_gpu (int64_t *agg, const int64_t val, const int64_t min_val, const int64_t bucket_size, const int64_t base_dev_addr, const int64_t base_host_addr, const uint64_t sub_bitmap_count, const uint64_t bitmap_bytes)
 
__device__ void agg_count_distinct_bitmap_skip_val_gpu (int64_t *agg, const int64_t val, const int64_t min_val, const int64_t bucket_size, const int64_t skip_val, const int64_t base_dev_addr, const int64_t base_host_addr, const uint64_t sub_bitmap_count, const uint64_t bitmap_bytes)
 
__device__ void agg_approximate_count_distinct_gpu (int64_t *agg, const int64_t key, const uint32_t b, const int64_t base_dev_addr, const int64_t base_host_addr)
 
__device__ void force_sync ()
 
__device__ void sync_warp ()
 
__device__ void sync_warp_protected (int64_t thread_pos, int64_t row_count)
 
__device__ void sync_threadblock ()
 
__device__ void write_back_non_grouped_agg (int64_t *input_buffer, int64_t *output_buffer, const int32_t agg_idx)
 

Variables

__device__ int64_t dw_sm_cycle_start [128]
 
__device__ int64_t dw_cycle_budget = 0
 
__device__ int32_t dw_abort = 0
 
__device__ int32_t runtime_interrupt_flag = 0
 

Macro Definition Documentation

#define ADDR_T   uint64_t

Definition at line 1066 of file cuda_mapd_rt.cu.

#define ADDR_T   uint32_t

Definition at line 1066 of file cuda_mapd_rt.cu.

#define ADDR_T   uint64_t

Definition at line 1066 of file cuda_mapd_rt.cu.

#define ADDR_T   uint32_t

Definition at line 1066 of file cuda_mapd_rt.cu.

#define DATA_T   int64_t

Definition at line 1065 of file cuda_mapd_rt.cu.

#define DATA_T   int32_t

Definition at line 1065 of file cuda_mapd_rt.cu.

#define DATA_T   double

Definition at line 1065 of file cuda_mapd_rt.cu.

#define DATA_T   float

Definition at line 1065 of file cuda_mapd_rt.cu.

#define DEF_AGG_ID_INT_SHARED (   n)
Value:
extern "C" __device__ void agg_id_int##n##_shared(int##n##_t* agg, \
const int##n##_t val) { \
*agg = val; \
}

Definition at line 762 of file cuda_mapd_rt.cu.
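
For readers unfamiliar with the token pasting, here is a minimal sketch of what a hypothetical invocation DEF_AGG_ID_INT_SHARED(32) would expand to; the actual invocations are generated by the preprocessor and are not shown on this page.

extern "C" __device__ void agg_id_int32_shared(int32_t* agg, const int32_t val) {
  *agg = val;  // "id" aggregate: the slot simply takes the incoming value
}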

#define DEF_SKIP_AGG (   base_agg_func)
Value:
extern "C" __device__ ADDR_T base_agg_func##_skip_val_shared( \
ADDR_T* agg, const DATA_T val, const DATA_T skip_val) { \
if (val != skip_val) { \
return base_agg_func##_shared(agg, val); \
} \
return 0; \
}
#define DATA_T
#define ADDR_T

Definition at line 1050 of file cuda_mapd_rt.cu.

#define DEF_SKIP_AGG (   base_agg_func)
Value:
extern "C" __device__ ADDR_T base_agg_func##_skip_val_shared( \
ADDR_T* agg, const DATA_T val, const DATA_T skip_val) { \
if (val != skip_val) { \
return base_agg_func##_shared(agg, val); \
} \
return *agg; \
}
#define DATA_T
#define ADDR_T

Definition at line 1050 of file cuda_mapd_rt.cu.
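
The two DEF_SKIP_AGG variants differ only in what they return when val equals skip_val: the first returns 0 (suitable for count-style aggregates), the second returns the current contents of *agg. DATA_T and ADDR_T are redefined before each instantiation. As a hedged illustration, assuming (hypothetically) the first variant were instantiated with base_agg_func = agg_count, DATA_T = int64_t, and ADDR_T = uint64_t, the expansion would read:

extern "C" __device__ uint64_t agg_count_skip_val_shared(
    uint64_t* agg, const int64_t val, const int64_t skip_val) {
  if (val != skip_val) {
    return agg_count_shared(agg, val);  // matches the agg_count_shared signature documented below
  }
  return 0;  // skipped: leave the counter unchanged
}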

#define EXECUTE_INCLUDE

Definition at line 1273 of file cuda_mapd_rt.cu.

#define init_group_by_buffer_gpu_impl   init_group_by_buffer_gpu

Definition at line 82 of file cuda_mapd_rt.cu.

Function Documentation

__device__ void agg_approximate_count_distinct_gpu ( int64_t *  agg,
const int64_t  key,
const uint32_t  b,
const int64_t  base_dev_addr,
const int64_t  base_host_addr 
)

Definition at line 1346 of file cuda_mapd_rt.cu.

References atomicMax(), get_rank(), and MurmurHash64A().

1351  {
1352  const uint64_t hash = MurmurHash64A(&key, sizeof(key), 0);
1353  const uint32_t index = hash >> (64 - b);
1354  const int32_t rank = get_rank(hash << b, 64 - b);
1355  const int64_t host_addr = *agg;
1356  int32_t* M = (int32_t*)(base_dev_addr + host_addr - base_host_addr);
1357  atomicMax(&M[index], rank);
1358 }
FORCE_INLINE uint8_t get_rank(uint64_t x, uint32_t b)
RUNTIME_EXPORT NEVER_INLINE DEVICE uint64_t MurmurHash64A(const void *key, int len, uint64_t seed)
Definition: MurmurHash.cpp:27
__device__ double atomicMax(double *address, double val)

+ Here is the call graph for this function:
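
This function updates one HyperLogLog register: the top b bits of the 64-bit MurmurHash select the register index, and get_rank() supplies the rank of the remaining 64 - b bits, which is folded in with atomicMax(). Below is a single-threaded host sketch of the same arithmetic, assuming get_rank(w, m) returns the 1-based position of the leftmost set bit among the top m bits of w (the usual HyperLogLog rank); that assumption about get_rank is not verified on this page.

#include <algorithm>
#include <cstdint>
#include <vector>

void hll_update_sketch(std::vector<int32_t>& M, uint64_t hash, uint32_t b) {
  const uint32_t index = static_cast<uint32_t>(hash >> (64 - b));  // top b bits pick the register
  const uint64_t w = hash << b;                                    // remaining 64 - b bits, left-aligned
  int32_t rank = 1;
  while (rank <= static_cast<int32_t>(64 - b) && ((w >> (64 - rank)) & 1ULL) == 0) {
    ++rank;  // leading zeros of the remaining bits, plus one
  }
  M[index] = std::max(M[index], rank);  // the device code does this with atomicMax on M[index]
}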

__device__ void agg_count_distinct_bitmap_gpu ( int64_t *  agg,
const int64_t  val,
const int64_t  min_val,
const int64_t  bucket_size,
const int64_t  base_dev_addr,
const int64_t  base_host_addr,
const uint64_t  sub_bitmap_count,
const uint64_t  bitmap_bytes 
)

Definition at line 1303 of file cuda_mapd_rt.cu.

Referenced by agg_count_distinct_bitmap_skip_val_gpu().

1310  {
1311  constexpr unsigned bitmap_element_size = 8 * sizeof(uint32_t);
1312  auto bitmap_idx = static_cast<uint64_t>(val - min_val);
1313  if (1 < bucket_size) {
1314  bitmap_idx /= static_cast<uint64_t>(bucket_size);
1315  }
1316  uint64_t const word_idx = bitmap_idx / bitmap_element_size;
1317  uint32_t const bit_idx = bitmap_idx % bitmap_element_size;
1318  int64_t const agg_offset = *agg - base_host_addr;
1319  int64_t const thread_offset = (threadIdx.x & (sub_bitmap_count - 1)) * bitmap_bytes;
1320  auto* bitmap = reinterpret_cast<uint32_t*>(base_dev_addr + agg_offset + thread_offset);
1321  atomicOr(bitmap + word_idx, 1u << bit_idx);
1322 }

+ Here is the caller graph for this function:
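
The indexing above can be read as follows: the (optionally bucketed) distance of val from min_val selects a bit, bits are grouped into 32-bit words, and each thread writes into one of sub_bitmap_count per-thread sub-bitmaps chosen by threadIdx.x & (sub_bitmap_count - 1); that masking behaves as a modulo only when sub_bitmap_count is a power of two. A host-side sketch of the word/bit computation (illustrative helper, not runtime code):

#include <cstdint>

struct BitPos {
  uint64_t word_idx;  // index of the 32-bit word inside the bitmap
  uint32_t bit_idx;   // bit within that word
};

BitPos bitmap_position(int64_t val, int64_t min_val, int64_t bucket_size) {
  uint64_t idx = static_cast<uint64_t>(val - min_val);
  if (bucket_size > 1) {
    idx /= static_cast<uint64_t>(bucket_size);  // values are first mapped to buckets
  }
  return BitPos{idx / 32, static_cast<uint32_t>(idx % 32)};
}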

__device__ void agg_count_distinct_bitmap_skip_val_gpu ( int64_t *  agg,
const int64_t  val,
const int64_t  min_val,
const int64_t  bucket_size,
const int64_t  skip_val,
const int64_t  base_dev_addr,
const int64_t  base_host_addr,
const uint64_t  sub_bitmap_count,
const uint64_t  bitmap_bytes 
)

Definition at line 1324 of file cuda_mapd_rt.cu.

References agg_count_distinct_bitmap_gpu().

1333  {
1334  if (val != skip_val) {
1336  val,
1337  min_val,
1338  bucket_size,
1339  base_dev_addr,
1340  base_host_addr,
1341  sub_bitmap_count,
1342  bitmap_bytes);
1343  }
1344 }
__device__ void agg_count_distinct_bitmap_gpu(int64_t *agg, const int64_t val, const int64_t min_val, const int64_t bucket_size, const int64_t base_dev_addr, const int64_t base_host_addr, const uint64_t sub_bitmap_count, const uint64_t bitmap_bytes)

+ Here is the call graph for this function:

__device__ uint64_t agg_count_double_shared ( uint64_t *  agg,
const double  val 
)

Definition at line 448 of file cuda_mapd_rt.cu.

References agg_count_shared().

448  {
449  return agg_count_shared(agg, val);
450 }
__device__ uint64_t agg_count_shared(uint64_t *agg, const int64_t val)

+ Here is the call graph for this function:

__device__ uint32_t agg_count_float_shared ( uint32_t *  agg,
const float  val 
)

Definition at line 452 of file cuda_mapd_rt.cu.

References agg_count_int32_shared().

452  {
453  return agg_count_int32_shared(agg, val);
454 }
__device__ uint32_t agg_count_int32_shared(uint32_t *agg, const int32_t val)

+ Here is the call graph for this function:

__device__ uint32_t agg_count_if_int32_shared ( uint32_t *  agg,
const int32_t  cond 
)

Definition at line 443 of file cuda_mapd_rt.cu.

444  {
445  return cond ? atomicAdd(agg, 1U) : *agg;
446 }
__device__ uint64_t agg_count_if_shared ( uint64_t *  agg,
const int64_t  cond 
)

Definition at line 434 of file cuda_mapd_rt.cu.

434  {
435  return cond ? static_cast<uint64_t>(atomicAdd(reinterpret_cast<uint32_t*>(agg), 1U))
436  : static_cast<uint64_t>(*(reinterpret_cast<uint32_t*>(agg)));
437 }
__device__ uint32_t agg_count_int32_shared ( uint32_t *  agg,
const int32_t  val 
)

Definition at line 439 of file cuda_mapd_rt.cu.

Referenced by agg_count_float_shared().

439  {
440  return atomicAdd(agg, 1U);
441 }

+ Here is the caller graph for this function:

__device__ uint64_t agg_count_shared ( uint64_t *  agg,
const int64_t  val 
)

Definition at line 430 of file cuda_mapd_rt.cu.

Referenced by agg_count_double_shared().

430  {
431  return static_cast<uint64_t>(atomicAdd(reinterpret_cast<uint32_t*>(agg), 1U));
432 }

+ Here is the caller graph for this function:

__device__ void agg_id_double_shared ( int64_t *  agg,
const double  val 
)

Definition at line 774 of file cuda_mapd_rt.cu.

774  {
775  *agg = *(reinterpret_cast<const int64_t*>(&val));
776 }
__device__ void agg_id_double_shared_slow ( int64_t *  agg,
const double *  val 
)

Definition at line 805 of file cuda_mapd_rt.cu.

805  {
806  *agg = *(reinterpret_cast<const int64_t*>(val));
807 }
__device__ void agg_id_float_shared ( int32_t *  agg,
const float  val 
)

Definition at line 838 of file cuda_mapd_rt.cu.

838  {
839  *agg = __float_as_int(val);
840 }
__device__ void agg_id_shared ( int64_t *  agg,
const int64_t  val 
)

Definition at line 721 of file cuda_mapd_rt.cu.

721  {
722  *agg = val;
723 }
__device__ int8_t* agg_id_varlen_shared ( int8_t *  varlen_buffer,
const int64_t  offset,
const int8_t *  value,
const int64_t  size_bytes 
)

Definition at line 725 of file cuda_mapd_rt.cu.

728  {
729  for (auto i = 0; i < size_bytes; i++) {
730  varlen_buffer[offset + i] = value[i];
731  }
732  return &varlen_buffer[offset];
733 }
__device__ void agg_max_double_shared ( int64_t *  agg,
const double  val 
)

Definition at line 515 of file cuda_mapd_rt.cu.

References atomicMax().

515  {
516  atomicMax(reinterpret_cast<double*>(agg), val);
517 }
__device__ double atomicMax(double *address, double val)

+ Here is the call graph for this function:

__device__ void agg_max_double_skip_val_shared ( int64_t *  agg,
const double  val,
const double  skip_val 
)

Definition at line 1178 of file cuda_mapd_rt.cu.

References atomicMax().

1180  {
1181  if (__double_as_longlong(val) != __double_as_longlong(skip_val)) {
1182  double old = __longlong_as_double(atomicExch(
1183  reinterpret_cast<unsigned long long int*>(agg), __double_as_longlong(-DBL_MAX)));
1184  atomicMax(reinterpret_cast<double*>(agg),
1185  __double_as_longlong(old) == __double_as_longlong(skip_val)
1186  ? val
1187  : fmax(old, val));
1188  }
1189 }
__device__ double atomicMax(double *address, double val)

+ Here is the call graph for this function:

__device__ void agg_max_float_shared ( int32_t *  agg,
const float  val 
)

Definition at line 519 of file cuda_mapd_rt.cu.

References atomicMax().

519  {
520  atomicMax(reinterpret_cast<float*>(agg), val);
521 }
__device__ double atomicMax(double *address, double val)

+ Here is the call graph for this function:

__device__ void agg_max_float_skip_val_shared ( int32_t *  agg,
const float  val,
const float  skip_val 
)

Definition at line 1072 of file cuda_mapd_rt.cu.

References atomicMax().

1074  {
1075  if (__float_as_int(val) != __float_as_int(skip_val)) {
1076  float old = atomicExch(reinterpret_cast<float*>(agg), -FLT_MAX);
1077  atomicMax(reinterpret_cast<float*>(agg),
1078  __float_as_int(old) == __float_as_int(skip_val) ? val : fmaxf(old, val));
1079  }
1080 }
__device__ double atomicMax(double *address, double val)

+ Here is the call graph for this function:

__device__ void agg_max_int16_shared ( int16_t *  agg,
const int16_t  val 
)

Definition at line 697 of file cuda_mapd_rt.cu.

References atomicMax16().

Referenced by agg_max_int16_skip_val_shared().

697  {
698  return atomicMax16(agg, val);
699 }
__device__ void atomicMax16(int16_t *agg, const int16_t val)

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

__device__ void agg_max_int16_skip_val_shared ( int16_t *  agg,
const int16_t  val,
const int16_t  skip_val 
)

Definition at line 901 of file cuda_mapd_rt.cu.

References agg_max_int16_shared().

903  {
904  if (val != skip_val) {
905  agg_max_int16_shared(agg, val);
906  }
907 }
__device__ void agg_max_int16_shared(int16_t *agg, const int16_t val)

+ Here is the call graph for this function:

__device__ void agg_max_int32_shared ( int32_t *  agg,
const int32_t  val 
)

Definition at line 511 of file cuda_mapd_rt.cu.

References atomicMax().

Referenced by agg_max_int32_skip_val_shared().

511  {
512  atomicMax(agg, val);
513 }
__device__ double atomicMax(double *address, double val)

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

__device__ void agg_max_int32_skip_val_shared ( int32_t *  agg,
const int32_t  val,
const int32_t  skip_val 
)

Definition at line 893 of file cuda_mapd_rt.cu.

References agg_max_int32_shared().

895  {
896  if (val != skip_val) {
897  agg_max_int32_shared(agg, val);
898  }
899 }
__device__ void agg_max_int32_shared(int32_t *agg, const int32_t val)

+ Here is the call graph for this function:

__device__ void agg_max_int8_shared ( int8_t *  agg,
const int8_t  val 
)

Definition at line 701 of file cuda_mapd_rt.cu.

References atomicMax8().

Referenced by agg_max_int8_skip_val_shared().

701  {
702  return atomicMax8(agg, val);
703 }
__device__ void atomicMax8(int8_t *agg, const int8_t val)

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

__device__ void agg_max_int8_skip_val_shared ( int8_t *  agg,
const int8_t  val,
const int8_t  skip_val 
)

Definition at line 917 of file cuda_mapd_rt.cu.

References agg_max_int8_shared().

919  {
920  if (val != skip_val) {
921  agg_max_int8_shared(agg, val);
922  }
923 }
__device__ void agg_max_int8_shared(int8_t *agg, const int8_t val)

+ Here is the call graph for this function:

__device__ void agg_max_shared ( int64_t *  agg,
const int64_t  val 
)

Definition at line 507 of file cuda_mapd_rt.cu.

References atomicMax64().

507  {
508  atomicMax64(agg, val);
509 }
__device__ int64_t atomicMax64(int64_t *address, int64_t val)

+ Here is the call graph for this function:

__device__ void agg_max_skip_val_shared ( int64_t *  agg,
const int64_t  val,
const int64_t  skip_val 
)

Definition at line 1041 of file cuda_mapd_rt.cu.

References atomicMax64SkipVal().

1043  {
1044  if (val != skip_val) {
1045  atomicMax64SkipVal(agg, val, skip_val);
1046  }
1047 }
__device__ int64_t atomicMax64SkipVal(int64_t *address, int64_t val, const int64_t skip_val)

+ Here is the call graph for this function:

__device__ void agg_min_double_shared ( int64_t *  agg,
const double  val 
)

Definition at line 713 of file cuda_mapd_rt.cu.

References atomicMin().

713  {
714  atomicMin(reinterpret_cast<double*>(agg), val);
715 }
__device__ double atomicMin(double *address, double val)

+ Here is the call graph for this function:

__device__ void agg_min_double_skip_val_shared ( int64_t *  agg,
const double  val,
const double  skip_val 
)

Definition at line 1170 of file cuda_mapd_rt.cu.

References atomicMinDblSkipVal().

1172  {
1173  if (val != skip_val) {
1174  atomicMinDblSkipVal(reinterpret_cast<double*>(agg), val, skip_val);
1175  }
1176 }
__device__ double atomicMinDblSkipVal(double *address, double val, const double skip_val)

+ Here is the call graph for this function:

__device__ void agg_min_float_shared ( int32_t *  agg,
const float  val 
)

Definition at line 717 of file cuda_mapd_rt.cu.

References atomicMin().

717  {
718  atomicMin(reinterpret_cast<float*>(agg), val);
719 }
__device__ double atomicMin(double *address, double val)

+ Here is the call graph for this function:

__device__ void agg_min_float_skip_val_shared ( int32_t *  agg,
const float  val,
const float  skip_val 
)

Definition at line 1089 of file cuda_mapd_rt.cu.

References atomicMinFltSkipVal().

1091  {
1092  if (__float_as_int(val) != __float_as_int(skip_val)) {
1093  atomicMinFltSkipVal(agg, val, skip_val);
1094  }
1095 }
__device__ float atomicMinFltSkipVal(int32_t *address, float val, const float skip_val)

+ Here is the call graph for this function:

__device__ void agg_min_int16_shared ( int16_t *  agg,
const int16_t  val 
)

Definition at line 705 of file cuda_mapd_rt.cu.

References atomicMin16().

705  {
706  return atomicMin16(agg, val);
707 }
__device__ void atomicMin16(int16_t *agg, const int16_t val)

+ Here is the call graph for this function:

__device__ void agg_min_int16_skip_val_shared ( int16_t *  agg,
const int16_t  val,
const int16_t  skip_val 
)

Definition at line 909 of file cuda_mapd_rt.cu.

References atomicMin16SkipVal().

911  {
912  if (val != skip_val) {
913  atomicMin16SkipVal(agg, val, skip_val);
914  }
915 }
__device__ void atomicMin16SkipVal(int16_t *agg, const int16_t val, const int16_t skip_val)

+ Here is the call graph for this function:

__device__ void agg_min_int32_shared ( int32_t *  agg,
const int32_t  val 
)

Definition at line 527 of file cuda_mapd_rt.cu.

References atomicMin().

527  {
528  atomicMin(agg, val);
529 }
__device__ double atomicMin(double *address, double val)

+ Here is the call graph for this function:

__device__ void agg_min_int32_skip_val_shared ( int32_t *  agg,
const int32_t  val,
const int32_t  skip_val 
)

Definition at line 940 of file cuda_mapd_rt.cu.

References atomicMin32SkipVal().

942  {
943  if (val != skip_val) {
944  atomicMin32SkipVal(agg, val, skip_val);
945  }
946 }
__device__ int32_t atomicMin32SkipVal(int32_t *address, int32_t val, const int32_t skip_val)

+ Here is the call graph for this function:

__device__ void agg_min_int8_shared ( int8_t *  agg,
const int8_t  val 
)

Definition at line 709 of file cuda_mapd_rt.cu.

References atomicMin8().

709  {
710  return atomicMin8(agg, val);
711 }
__device__ void atomicMin8(int8_t *agg, const int8_t val)

+ Here is the call graph for this function:

__device__ void agg_min_int8_skip_val_shared ( int8_t *  agg,
const int8_t  val,
const int8_t  skip_val 
)

Definition at line 925 of file cuda_mapd_rt.cu.

References atomicMin8SkipVal().

927  {
928  if (val != skip_val) {
929  atomicMin8SkipVal(agg, val, skip_val);
930  }
931 }
__device__ void atomicMin8SkipVal(int8_t *agg, const int8_t val, const int8_t skip_val)

+ Here is the call graph for this function:

__device__ void agg_min_shared ( int64_t *  agg,
const int64_t  val 
)

Definition at line 523 of file cuda_mapd_rt.cu.

References atomicMin64().

523  {
524  atomicMin64(agg, val);
525 }
__device__ int64_t atomicMin64(int64_t *address, int64_t val)

+ Here is the call graph for this function:

__device__ void agg_min_skip_val_shared ( int64_t *  agg,
const int64_t  val,
const int64_t  skip_val 
)

Definition at line 1016 of file cuda_mapd_rt.cu.

References atomicMin64SkipVal().

1018  {
1019  if (val != skip_val) {
1020  atomicMin64SkipVal(agg, val, skip_val);
1021  }
1022 }
__device__ int64_t atomicMin64SkipVal(int64_t *address, int64_t val, const int64_t skip_val)

+ Here is the call graph for this function:

__device__ void agg_sum_double_shared ( int64_t *  agg,
const double  val 
)

Definition at line 468 of file cuda_mapd_rt.cu.

468  {
469  atomicAdd(reinterpret_cast<double*>(agg), val);
470 }
__device__ void agg_sum_double_skip_val_shared ( int64_t *  agg,
const double  val,
const double  skip_val 
)

Definition at line 1131 of file cuda_mapd_rt.cu.

References atomicSumDblSkipVal().

Referenced by agg_sum_if_double_skip_val_shared().

1133  {
1134  if (__double_as_longlong(val) != __double_as_longlong(skip_val)) {
1135  atomicSumDblSkipVal(reinterpret_cast<double*>(agg), val, skip_val);
1136  }
1137 }
__device__ void atomicSumDblSkipVal(double *address, const double val, const double skip_val)

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

__device__ void agg_sum_float_shared ( int32_t *  agg,
const float  val 
)

Definition at line 464 of file cuda_mapd_rt.cu.

464  {
465  atomicAdd(reinterpret_cast<float*>(agg), val);
466 }
__device__ void agg_sum_float_skip_val_shared ( int32_t *  agg,
const float  val,
const float  skip_val 
)

Definition at line 1104 of file cuda_mapd_rt.cu.

References atomicSumFltSkipVal().

Referenced by agg_sum_if_float_skip_val_shared().

1106  {
1107  if (__float_as_int(val) != __float_as_int(skip_val)) {
1108  atomicSumFltSkipVal(reinterpret_cast<float*>(agg), val, skip_val);
1109  }
1110 }
__device__ void atomicSumFltSkipVal(float *address, const float val, const float skip_val)

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

__device__ void agg_sum_if_double_shared ( int64_t *  agg,
const double  val,
const int8_t  cond 
)

Definition at line 499 of file cuda_mapd_rt.cu.

501  {
502  if (cond) {
503  atomicAdd(reinterpret_cast<double*>(agg), val);
504  }
505 }
__device__ void agg_sum_if_double_skip_val_shared ( int64_t *  agg,
const double  val,
const double  skip_val,
const int8_t  cond 
)

Definition at line 1139 of file cuda_mapd_rt.cu.

References agg_sum_double_skip_val_shared().

1142  {
1143  if (cond) {
1144  agg_sum_double_skip_val_shared(agg, val, skip_val);
1145  }
1146 }
__device__ void agg_sum_double_skip_val_shared(int64_t *agg, const double val, const double skip_val)

+ Here is the call graph for this function:

__device__ void agg_sum_if_float_shared ( int32_t *  agg,
const float  val,
const int8_t  cond 
)

Definition at line 491 of file cuda_mapd_rt.cu.

493  {
494  if (cond) {
495  atomicAdd(reinterpret_cast<float*>(agg), val);
496  }
497 }
__device__ void agg_sum_if_float_skip_val_shared ( int32_t *  agg,
const float  val,
const float  skip_val,
const int8_t  cond 
)

Definition at line 1112 of file cuda_mapd_rt.cu.

References agg_sum_float_skip_val_shared().

1115  {
1116  if (cond) {
1117  agg_sum_float_skip_val_shared(agg, val, skip_val);
1118  }
1119 }
__device__ void agg_sum_float_skip_val_shared(int32_t *agg, const float val, const float skip_val)

+ Here is the call graph for this function:

__device__ int32_t agg_sum_if_int32_shared ( int32_t *  agg,
const int32_t  val,
const int8_t  cond 
)

Definition at line 482 of file cuda_mapd_rt.cu.

484  {
485  if (cond) {
486  return atomicAdd(agg, val);
487  }
488  return *agg;
489 }
__device__ int32_t agg_sum_if_int32_skip_val_shared ( int32_t *  agg,
const int32_t  val,
const int32_t  skip_val,
const int8_t  cond 
)

Definition at line 967 of file cuda_mapd_rt.cu.

References agg_sum_int32_skip_val_shared().

970  {
971  return cond ? agg_sum_int32_skip_val_shared(agg, val, skip_val) : *agg;
972 }
__device__ int32_t agg_sum_int32_skip_val_shared(int32_t *agg, const int32_t val, const int32_t skip_val)

+ Here is the call graph for this function:

__device__ int64_t agg_sum_if_shared ( int64_t *  agg,
const int64_t  val,
const int8_t  cond 
)

Definition at line 472 of file cuda_mapd_rt.cu.

474  {
475  static_assert(sizeof(int64_t) == sizeof(unsigned long long));
476  if (cond) {
477  return atomicAdd(reinterpret_cast<unsigned long long*>(agg), val);
478  }
479  return *agg;
480 }
__device__ int64_t agg_sum_if_skip_val_shared ( int64_t *  agg,
const int64_t  val,
const int64_t  skip_val,
const int8_t  cond 
)

Definition at line 992 of file cuda_mapd_rt.cu.

References agg_sum_skip_val_shared().

995  {
996  return cond ? agg_sum_skip_val_shared(agg, val, skip_val) : *agg;
997 }
__device__ int64_t agg_sum_skip_val_shared(int64_t *agg, const int64_t val, const int64_t skip_val)

+ Here is the call graph for this function:

__device__ int32_t agg_sum_int32_shared ( int32_t *  agg,
const int32_t  val 
)

Definition at line 460 of file cuda_mapd_rt.cu.

460  {
461  return atomicAdd(agg, val);
462 }
__device__ int32_t agg_sum_int32_skip_val_shared ( int32_t *  agg,
const int32_t  val,
const int32_t  skip_val 
)

Definition at line 957 of file cuda_mapd_rt.cu.

References atomicSum32SkipVal().

Referenced by agg_sum_if_int32_skip_val_shared().

959  {
960  if (val != skip_val) {
961  const int32_t old = atomicSum32SkipVal(agg, val, skip_val);
962  return old;
963  }
964  return 0;
965 }
__device__ int32_t atomicSum32SkipVal(int32_t *address, const int32_t val, const int32_t skip_val)

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

__device__ int64_t agg_sum_shared ( int64_t *  agg,
const int64_t  val 
)

Definition at line 456 of file cuda_mapd_rt.cu.

Referenced by write_back_non_grouped_agg().

456  {
457  return atomicAdd(reinterpret_cast<unsigned long long*>(agg), val);
458 }

+ Here is the caller graph for this function:

__device__ int64_t agg_sum_skip_val_shared ( int64_t *  agg,
const int64_t  val,
const int64_t  skip_val 
)

Definition at line 983 of file cuda_mapd_rt.cu.

References atomicSum64SkipVal().

Referenced by agg_sum_if_skip_val_shared().

985  {
986  if (val != skip_val) {
987  return atomicSum64SkipVal(agg, val, skip_val);
988  }
989  return 0;
990 }
__device__ int64_t atomicSum64SkipVal(int64_t *address, const int64_t val, const int64_t skip_val)

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

__device__ double atomicMax ( double *  address,
double  val 
)

Definition at line 372 of file cuda_mapd_rt.cu.

Referenced by agg_approximate_count_distinct_gpu(), agg_max_double_shared(), agg_max_double_skip_val_shared(), agg_max_float_shared(), agg_max_float_skip_val_shared(), agg_max_int32_shared(), and approximate_distinct_tuples_impl().

372  {
373  unsigned long long int* address_as_ull = (unsigned long long int*)address;
374  unsigned long long int old = *address_as_ull, assumed;
375 
376  do {
377  assumed = old;
378  old = atomicCAS(address_as_ull,
379  assumed,
380  __double_as_longlong(max(val, __longlong_as_double(assumed))));
381 
382  // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN)
383  } while (assumed != old);
384 
385  return __longlong_as_double(old);
386 }

+ Here is the caller graph for this function:
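
CUDA has no built-in atomicMax for double, so the overload above emulates one with an atomicCAS loop on the 64-bit bit pattern; as the comment notes, comparing bit patterns rather than doubles avoids hanging on NaN. A minimal hypothetical usage sketch (not part of the runtime):

__global__ void column_max_kernel(const double* in, int n, double* out_max) {
  const int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) {
    atomicMax(out_max, in[i]);  // resolves to the double overload defined above;
  }                             // *out_max should be host-initialized, e.g. to -DBL_MAX
}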

__device__ float atomicMax ( float *  address,
float  val 
)

Definition at line 388 of file cuda_mapd_rt.cu.

388  {
389  int* address_as_int = (int*)address;
390  int old = *address_as_int, assumed;
391 
392  do {
393  assumed = old;
394  old = atomicCAS(
395  address_as_int, assumed, __float_as_int(max(val, __int_as_float(assumed))));
396 
397  // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN)
398  } while (assumed != old);
399 
400  return __int_as_float(old);
401 }
__device__ void atomicMax16 ( int16_t *  agg,
const int16_t  val 
)

Definition at line 545 of file cuda_mapd_rt.cu.

Referenced by agg_max_int16_shared().

545  {
546  // properly align the input pointer:
547  unsigned int* base_address_u32 =
548  reinterpret_cast<unsigned int*>(reinterpret_cast<size_t>(agg) & ~0x3);
549 
550  unsigned int old_value = *base_address_u32;
551  unsigned int swap_value, compare_value;
552  do {
553  compare_value = old_value;
554  swap_value =
555  (reinterpret_cast<size_t>(agg) & 0x2)
556  ? static_cast<unsigned int>(max(static_cast<int16_t>(old_value >> 16), val))
557  << 16 |
558  (old_value & 0xFFFF)
559  : (old_value & 0xFFFF0000) |
560  static_cast<unsigned int>(
561  max(static_cast<int16_t>(old_value & 0xFFFF), val));
562  old_value = atomicCAS(base_address_u32, compare_value, swap_value);
563  } while (old_value != compare_value);
564 }

+ Here is the caller graph for this function:
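
The 16-bit update above works on the containing 32-bit word: the pointer is aligned down to a 4-byte boundary, the new half-word is spliced into the correct half, and the whole word is swapped in with a 32-bit atomicCAS. A host-side sketch of the splice for the upper half-word (the lower-half case mirrors it):

#include <algorithm>
#include <cstdint>

uint32_t splice_upper_halfword(uint32_t word, int16_t candidate) {
  const int16_t current = static_cast<int16_t>(word >> 16);
  const uint32_t new_hi = static_cast<uint16_t>(std::max(current, candidate));
  return (new_hi << 16) | (word & 0xFFFFu);  // keep the low half untouched
}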

__device__ int64_t atomicMax64 ( int64_t *  address,
int64_t  val 
)

Definition at line 330 of file cuda_mapd_rt.cu.

Referenced by agg_max_shared().

330  {
331  unsigned long long int* address_as_ull = (unsigned long long int*)address;
332  unsigned long long int old = *address_as_ull, assumed;
333 
334  do {
335  assumed = old;
336  old = atomicCAS(address_as_ull, assumed, max((long long)val, (long long)assumed));
337  } while (assumed != old);
338 
339  return old;
340 }

+ Here is the caller graph for this function:

__device__ int64_t atomicMax64SkipVal ( int64_t *  address,
int64_t  val,
const int64_t  skip_val 
)

Definition at line 1024 of file cuda_mapd_rt.cu.

Referenced by agg_max_skip_val_shared().

1026  {
1027  unsigned long long int* address_as_ull =
1028  reinterpret_cast<unsigned long long int*>(address);
1029  unsigned long long int old = *address_as_ull, assumed;
1030 
1031  do {
1032  assumed = old;
1033  old = atomicCAS(address_as_ull,
1034  assumed,
1035  assumed == skip_val ? val : max((long long)val, (long long)assumed));
1036  } while (assumed != old);
1037 
1038  return old;
1039 }

+ Here is the caller graph for this function:

__device__ void atomicMax8 ( int8_t *  agg,
const int8_t  val 
)

Definition at line 567 of file cuda_mapd_rt.cu.

Referenced by agg_max_int8_shared().

567  {
568  // properly align the input pointer:
569  unsigned int* base_address_u32 =
570  reinterpret_cast<unsigned int*>(reinterpret_cast<size_t>(agg) & ~0x3);
571 
572  // __byte_perm(unsigned int A, unsigned int B, unsigned int s):
573  // if s == 0x3214 returns {A[31..24], A[23..16], A[15..8], B[7..0]}
574  // if s == 0x3240 returns {A[31..24], A[23..16], B[7...0], A[7..0]}
575  // if s == 0x3410 returns {A[31..24], B[7....0], A[15..8], A[7..0]}
576  // if s == 0x4210 returns {B[7....0], A[23..16], A[15..8], A[7..0]}
577  constexpr unsigned int byte_permutations[] = {0x3214, 0x3240, 0x3410, 0x4210};
578  unsigned int old_value = *base_address_u32;
579  unsigned int swap_value, compare_value;
580  do {
581  compare_value = old_value;
582  auto max_value = static_cast<unsigned int>(
583  // compare val with its corresponding bits in the compare_value
584  max(val,
585  static_cast<int8_t>(__byte_perm(
586  compare_value, 0, (reinterpret_cast<size_t>(agg) & 0x3) | 0x4440))));
587  swap_value = __byte_perm(
588  compare_value, max_value, byte_permutations[reinterpret_cast<size_t>(agg) & 0x3]);
589  old_value = atomicCAS(base_address_u32, compare_value, swap_value);
590  } while (compare_value != old_value);
591 }

+ Here is the caller graph for this function:
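
With the selector (addr & 0x3) | 0x4440, __byte_perm returns just the addressed byte of the old word (the remaining result bytes come from the zero operand), and byte_permutations[addr & 0x3] then writes the maximized byte back into that same position before the CAS. The extraction step is equivalent to the following plain C++ (sketch only):

#include <cstdint>

int8_t extract_byte(uint32_t word, unsigned lane) {  // lane = address & 0x3
  return static_cast<int8_t>((word >> (8 * lane)) & 0xFFu);
}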

__device__ double atomicMin ( double *  address,
double  val 
)

Definition at line 403 of file cuda_mapd_rt.cu.

Referenced by agg_min_double_shared(), agg_min_float_shared(), agg_min_int32_shared(), atomicMin32SkipVal(), atomicMinFltSkipVal(), and compute_bucket_sizes_impl().

403  {
404  unsigned long long int* address_as_ull = (unsigned long long int*)address;
405  unsigned long long int old = *address_as_ull, assumed;
406 
407  do {
408  assumed = old;
409  old = atomicCAS(address_as_ull,
410  assumed,
411  __double_as_longlong(min(val, __longlong_as_double(assumed))));
412  } while (assumed != old);
413 
414  return __longlong_as_double(old);
415 }

+ Here is the caller graph for this function:

__device__ double atomicMin ( float *  address,
float  val 
)

Definition at line 417 of file cuda_mapd_rt.cu.

417  {
418  int* address_as_ull = (int*)address;
419  int old = *address_as_ull, assumed;
420 
421  do {
422  assumed = old;
423  old = atomicCAS(
424  address_as_ull, assumed, __float_as_int(min(val, __int_as_float(assumed))));
425  } while (assumed != old);
426 
427  return __int_as_float(old);
428 }
__device__ void atomicMin16 ( int16_t *  agg,
const int16_t  val 
)

Definition at line 607 of file cuda_mapd_rt.cu.

Referenced by agg_min_int16_shared().

607  {
608  // properly align the input pointer:
609  unsigned int* base_address_u32 =
610  reinterpret_cast<unsigned int*>(reinterpret_cast<size_t>(agg) & ~0x3);
611 
612  unsigned int old_value = *base_address_u32;
613  unsigned int swap_value, compare_value;
614  do {
615  compare_value = old_value;
616  swap_value =
617  (reinterpret_cast<size_t>(agg) & 0x2)
618  ? static_cast<unsigned int>(min(static_cast<int16_t>(old_value >> 16), val))
619  << 16 |
620  (old_value & 0xFFFF)
621  : (old_value & 0xFFFF0000) |
622  static_cast<unsigned int>(
623  min(static_cast<int16_t>(old_value & 0xFFFF), val));
624  old_value = atomicCAS(base_address_u32, compare_value, swap_value);
625  } while (old_value != compare_value);
626 }

+ Here is the caller graph for this function:

__device__ void atomicMin16SkipVal ( int16_t *  agg,
const int16_t  val,
const int16_t  skip_val 
)

Definition at line 629 of file cuda_mapd_rt.cu.

Referenced by agg_min_int16_skip_val_shared().

631  {
632  // properly align the input pointer:
633  unsigned int* base_address_u32 =
634  reinterpret_cast<unsigned int*>(reinterpret_cast<size_t>(agg) & ~0x3);
635 
636  unsigned int old_value = *base_address_u32;
637  unsigned int swap_value, compare_value;
638  do {
639  compare_value = old_value;
640  int16_t selected_old_val = (reinterpret_cast<size_t>(agg) & 0x2)
641  ? static_cast<int16_t>(old_value >> 16)
642  : static_cast<int16_t>(old_value & 0xFFFF);
643 
644  swap_value =
645  (reinterpret_cast<size_t>(agg) & 0x2)
646  ? static_cast<unsigned int>(
647  selected_old_val == skip_val ? val : min(selected_old_val, val))
648  << 16 |
649  (old_value & 0xFFFF)
650  : (old_value & 0xFFFF0000) |
651  static_cast<unsigned int>(
652  selected_old_val == skip_val ? val : min(selected_old_val, val));
653  old_value = atomicCAS(base_address_u32, compare_value, swap_value);
654  } while (old_value != compare_value);
655 }

+ Here is the caller graph for this function:

__device__ int32_t atomicMin32SkipVal ( int32_t *  address,
int32_t  val,
const int32_t  skip_val 
)

Definition at line 933 of file cuda_mapd_rt.cu.

References atomicMin().

Referenced by agg_min_int32_skip_val_shared().

935  {
936  int32_t old = atomicExch(address, INT_MAX);
937  return atomicMin(address, old == skip_val ? val : min(old, val));
938 }
__device__ double atomicMin(double *address, double val)

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

__device__ int64_t atomicMin64 ( int64_t *  address,
int64_t  val 
)

Definition at line 342 of file cuda_mapd_rt.cu.

Referenced by agg_min_shared().

342  {
343  unsigned long long int* address_as_ull = (unsigned long long int*)address;
344  unsigned long long int old = *address_as_ull, assumed;
345 
346  do {
347  assumed = old;
348  old = atomicCAS(address_as_ull, assumed, min((long long)val, (long long)assumed));
349  } while (assumed != old);
350 
351  return old;
352 }

+ Here is the caller graph for this function:

__device__ int64_t atomicMin64SkipVal ( int64_t *  address,
int64_t  val,
const int64_t  skip_val 
)

Definition at line 999 of file cuda_mapd_rt.cu.

Referenced by agg_min_skip_val_shared().

1001  {
1002  unsigned long long int* address_as_ull =
1003  reinterpret_cast<unsigned long long int*>(address);
1004  unsigned long long int old = *address_as_ull, assumed;
1005 
1006  do {
1007  assumed = old;
1008  old = atomicCAS(address_as_ull,
1009  assumed,
1010  assumed == skip_val ? val : min((long long)val, (long long)assumed));
1011  } while (assumed != old);
1012 
1013  return old;
1014 }

+ Here is the caller graph for this function:

__device__ void atomicMin8 ( int8_t *  agg,
const int8_t  val 
)

Definition at line 657 of file cuda_mapd_rt.cu.

Referenced by agg_min_int8_shared().

657  {
658  // properly align the input pointer:
659  unsigned int* base_address_u32 =
660  reinterpret_cast<unsigned int*>(reinterpret_cast<size_t>(agg) & ~0x3);
661 
662  constexpr unsigned int byte_permutations[] = {0x3214, 0x3240, 0x3410, 0x4210};
663  unsigned int old_value = *base_address_u32;
664  unsigned int swap_value, compare_value;
665  do {
666  compare_value = old_value;
667  auto min_value = static_cast<unsigned int>(
668  min(val,
669  static_cast<int8_t>(__byte_perm(
670  compare_value, 0, (reinterpret_cast<size_t>(agg) & 0x3) | 0x4440))));
671  swap_value = __byte_perm(
672  compare_value, min_value, byte_permutations[reinterpret_cast<size_t>(agg) & 0x3]);
673  old_value = atomicCAS(base_address_u32, compare_value, swap_value);
674  } while (compare_value != old_value);
675 }

+ Here is the caller graph for this function:

__device__ void atomicMin8SkipVal ( int8_t *  agg,
const int8_t  val,
const int8_t  skip_val 
)

Definition at line 677 of file cuda_mapd_rt.cu.

Referenced by agg_min_int8_skip_val_shared().

677  {
678  // properly align the input pointer:
679  unsigned int* base_address_u32 =
680  reinterpret_cast<unsigned int*>(reinterpret_cast<size_t>(agg) & ~0x3);
681 
682  constexpr unsigned int byte_permutations[] = {0x3214, 0x3240, 0x3410, 0x4210};
683  unsigned int old_value = *base_address_u32;
684  unsigned int swap_value, compare_value;
685  do {
686  compare_value = old_value;
687  int8_t selected_old_val = static_cast<int8_t>(
688  __byte_perm(compare_value, 0, (reinterpret_cast<size_t>(agg) & 0x3) | 0x4440));
689  auto min_value = static_cast<unsigned int>(
690  selected_old_val == skip_val ? val : min(val, selected_old_val));
691  swap_value = __byte_perm(
692  compare_value, min_value, byte_permutations[reinterpret_cast<size_t>(agg) & 0x3]);
693  old_value = atomicCAS(base_address_u32, compare_value, swap_value);
694  } while (compare_value != old_value);
695 }

+ Here is the caller graph for this function:

__device__ double atomicMinDblSkipVal ( double *  address,
double  val,
const double  skip_val 
)

Definition at line 1148 of file cuda_mapd_rt.cu.

Referenced by agg_min_double_skip_val_shared().

1150  {
1151  unsigned long long int* address_as_ull =
1152  reinterpret_cast<unsigned long long int*>(address);
1153  unsigned long long int old = *address_as_ull;
1154  unsigned long long int skip_val_as_ull =
1155  *reinterpret_cast<const unsigned long long*>(&skip_val);
1156  unsigned long long int assumed;
1157 
1158  do {
1159  assumed = old;
1160  old = atomicCAS(address_as_ull,
1161  assumed,
1162  assumed == skip_val_as_ull
1163  ? *reinterpret_cast<unsigned long long*>(&val)
1164  : __double_as_longlong(min(val, __longlong_as_double(assumed))));
1165  } while (assumed != old);
1166 
1167  return __longlong_as_double(old);
1168 }

+ Here is the caller graph for this function:

__device__ float atomicMinFltSkipVal ( int32_t *  address,
float  val,
const float  skip_val 
)

Definition at line 1082 of file cuda_mapd_rt.cu.

References atomicMin().

Referenced by agg_min_float_skip_val_shared().

1082  {
1083  float old = atomicExch(reinterpret_cast<float*>(address), FLT_MAX);
1084  return atomicMin(
1085  reinterpret_cast<float*>(address),
1086  __float_as_int(old) == __float_as_int(skip_val) ? val : fminf(old, val));
1087 }
__device__ double atomicMin(double *address, double val)

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

__device__ int32_t atomicSum32SkipVal ( int32_t *  address,
const int32_t  val,
const int32_t  skip_val 
)

Definition at line 948 of file cuda_mapd_rt.cu.

Referenced by agg_sum_int32_skip_val_shared().

950  {
951  unsigned int* address_as_int = (unsigned int*)address;
952  int32_t old = atomicExch(address_as_int, 0);
953  int32_t old2 = atomicAdd(address_as_int, old == skip_val ? val : (val + old));
954  return old == skip_val ? old2 : (old2 + old);
955 }

+ Here is the caller graph for this function:
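
The skip-value sum above first atomically exchanges the accumulator with 0 so that a stored skip value never enters the sum, then adds back old + val (or just val when the old contents were the skip value), and finally reconstructs the accumulator's prior contents as the return value. A single-threaded sketch of the same arithmetic:

#include <cstdint>

int32_t sum32_skip_val_sketch(int32_t* acc, int32_t val, int32_t skip_val) {
  const int32_t old = *acc;                    // what atomicExch(address, 0) returns on the device
  *acc = (old == skip_val) ? val : old + val;  // what the subsequent atomicAdd establishes
  return (old == skip_val) ? 0 : old;          // prior contents, with the skip value masked out
}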

__device__ int64_t atomicSum64SkipVal ( int64_t *  address,
const int64_t  val,
const int64_t  skip_val 
)

Definition at line 974 of file cuda_mapd_rt.cu.

Referenced by agg_sum_skip_val_shared().

976  {
977  unsigned long long int* address_as_ull = (unsigned long long int*)address;
978  int64_t old = atomicExch(address_as_ull, 0);
979  int64_t old2 = atomicAdd(address_as_ull, old == skip_val ? val : (val + old));
980  return old == skip_val ? old2 : (old2 + old);
981 }

+ Here is the caller graph for this function:

__device__ void atomicSumDblSkipVal ( double *  address,
const double  val,
const double  skip_val 
)

Definition at line 1121 of file cuda_mapd_rt.cu.

Referenced by agg_sum_double_skip_val_shared().

1123  {
1124  unsigned long long int* address_as_ull = (unsigned long long int*)address;
1125  double old = __longlong_as_double(atomicExch(address_as_ull, __double_as_longlong(0.)));
1126  atomicAdd(
1127  address,
1128  __double_as_longlong(old) == __double_as_longlong(skip_val) ? val : (val + old));
1129 }

+ Here is the caller graph for this function:

__device__ void atomicSumFltSkipVal ( float *  address,
const float  val,
const float  skip_val 
)

Definition at line 1097 of file cuda_mapd_rt.cu.


Referenced by agg_sum_float_skip_val_shared().

1099  {
1100  float old = atomicExch(address, 0.f);
1101  atomicAdd(address, __float_as_int(old) == __float_as_int(skip_val) ? val : (val + old));
1102 }

+ Here is the caller graph for this function:

__device__ bool check_interrupt ( )

Definition at line 159 of file cuda_mapd_rt.cu.

Referenced by check_interrupt_rt(), and ColumnFetcher::linearizeFixedLenArrayColFrags().

159  {
160  return (runtime_interrupt_flag == 1) ? true : false;
161 }
__device__ int32_t runtime_interrupt_flag
Definition: cuda_mapd_rt.cu:95

+ Here is the caller graph for this function:

__device__ int32_t checked_single_agg_id_double_shared ( int64_t *  agg,
const double  val,
const double  null_val 
)

Definition at line 778 of file cuda_mapd_rt.cu.

780  {
781  unsigned long long int* address_as_ull = reinterpret_cast<unsigned long long int*>(agg);
782  unsigned long long int old = *address_as_ull, assumed;
783 
784  if (val == null_val) {
785  return 0;
786  }
787 
788  do {
789  if (static_cast<int64_t>(old) != __double_as_longlong(null_val)) {
790  if (static_cast<int64_t>(old) != __double_as_longlong(val)) {
791  // see Execute::ERR_SINGLE_VALUE_FOUND_MULTIPLE_VALUES
792  return 15;
793  } else {
794  break;
795  }
796  }
797 
798  assumed = old;
799  old = atomicCAS(address_as_ull, assumed, __double_as_longlong(val));
800  } while (assumed != old);
801 
802  return 0;
803 }
__device__ int32_t checked_single_agg_id_double_shared_slow ( int64_t *  agg,
const double *  valp,
const double  null_val 
)

Definition at line 810 of file cuda_mapd_rt.cu.

812  {
813  unsigned long long int* address_as_ull = reinterpret_cast<unsigned long long int*>(agg);
814  unsigned long long int old = *address_as_ull, assumed;
815  double val = *valp;
816 
817  if (val == null_val) {
818  return 0;
819  }
820 
821  do {
822  if (static_cast<int64_t>(old) != __double_as_longlong(null_val)) {
823  if (static_cast<int64_t>(old) != __double_as_longlong(val)) {
824  // see Execute::ERR_SINGLE_VALUE_FOUND_MULTIPLE_VALUES
825  return 15;
826  } else {
827  break;
828  }
829  }
830 
831  assumed = old;
832  old = atomicCAS(address_as_ull, assumed, __double_as_longlong(val));
833  } while (assumed != old);
834 
835  return 0;
836 }
__device__ int32_t checked_single_agg_id_float_shared ( int32_t *  agg,
const float  val,
const float  null_val 
)

Definition at line 842 of file cuda_mapd_rt.cu.

844  {
845  int* address_as_ull = reinterpret_cast<int*>(agg);
846  int old = *address_as_ull, assumed;
847 
848  if (val == null_val) {
849  return 0;
850  }
851 
852  do {
853  if (old != __float_as_int(null_val)) {
854  if (old != __float_as_int(val)) {
855  // see Execute::ERR_SINGLE_VALUE_FOUND_MULTIPLE_VALUES
856  return 15;
857  } else {
858  break;
859  }
860  }
861 
862  assumed = old;
863  old = atomicCAS(address_as_ull, assumed, __float_as_int(val));
864  } while (assumed != old);
865 
866  return 0;
867 }
__device__ int32_t checked_single_agg_id_shared ( int64_t *  agg,
const int64_t  val,
const int64_t  null_val 
)

Definition at line 735 of file cuda_mapd_rt.cu.

737  {
738  unsigned long long int* address_as_ull = reinterpret_cast<unsigned long long int*>(agg);
739  unsigned long long int old = *address_as_ull, assumed;
740 
741  if (val == null_val) {
742  return 0;
743  }
744 
745  do {
746  if (static_cast<int64_t>(old) != null_val) {
747  if (static_cast<int64_t>(old) != val) {
748  // see Execute::ERR_SINGLE_VALUE_FOUND_MULTIPLE_VALUES
749  return 15;
750  } else {
751  break;
752  }
753  }
754 
755  assumed = old;
756  old = atomicCAS(address_as_ull, assumed, val);
757  } while (assumed != old);
758 
759  return 0;
760 }
__device__ int64_t* declare_dynamic_shared_memory ( )

Definition at line 56 of file cuda_mapd_rt.cu.

56  {
57  extern __shared__ int64_t shared_mem_buffer[];
58  return shared_mem_buffer;
59 }
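
The buffer returned above is the kernel's dynamically sized shared memory; its size is not fixed in the source but supplied as the third kernel-launch parameter. A minimal hypothetical sketch (names are illustrative, not the runtime's launch code):

#include <cstddef>
#include <cstdint>

__global__ void dynamic_smem_demo_kernel() {  // hypothetical kernel
  int64_t* smem = declare_dynamic_shared_memory();
  smem[threadIdx.x] = static_cast<int64_t>(threadIdx.x);  // one 8-byte slot per thread
  __syncthreads();
}

void launch_dynamic_smem_demo(int threads_per_block) {
  const size_t smem_bytes = threads_per_block * sizeof(int64_t);  // third <<<>>> argument
  dynamic_smem_demo_kernel<<<1, threads_per_block, smem_bytes>>>();
}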
__device__ bool dynamic_watchdog ( )

Definition at line 115 of file cuda_mapd_rt.cu.

Referenced by anonymous_namespace{ResultSetReduction.cpp}::check_watchdog(), check_watchdog_rt(), anonymous_namespace{ResultSetReduction.cpp}::check_watchdog_with_seed(), get_group_value_columnar_slot_with_watchdog(), get_group_value_columnar_with_watchdog(), and get_group_value_with_watchdog().

115  {
116  // check for dynamic watchdog, if triggered all threads return true
117  if (dw_cycle_budget == 0LL) {
118  return false; // Uninitialized watchdog can't check time
119  }
120  if (dw_abort == 1) {
121  return true; // Received host request to abort
122  }
123  uint32_t smid = get_smid();
124  if (smid >= 128) {
125  return false;
126  }
127  __shared__ volatile int64_t dw_block_cycle_start; // Thread block shared cycle start
128  __shared__ volatile bool
129  dw_should_terminate; // all threads within a block should return together if
130  // watchdog criteria is met
131 
132  // thread 0 either initializes or reads the initial clock cycle; the result is stored
133  // into shared memory. Since all threads within a block share the same SM, there's no
134  // point in using more threads here.
135  if (threadIdx.x == 0) {
136  dw_block_cycle_start = 0LL;
137  int64_t cycle_count = static_cast<int64_t>(clock64());
138  // Make sure the block hasn't switched SMs
139  if (smid == get_smid()) {
140  dw_block_cycle_start = static_cast<int64_t>(
141  atomicCAS(reinterpret_cast<unsigned long long*>(&dw_sm_cycle_start[smid]),
142  0ULL,
143  static_cast<unsigned long long>(cycle_count)));
144  }
145 
146  int64_t cycles = cycle_count - dw_block_cycle_start;
147  if ((smid == get_smid()) && (dw_block_cycle_start > 0LL) &&
148  (cycles > dw_cycle_budget)) {
149  // Check if we're out of time on this particular SM
150  dw_should_terminate = true;
151  } else {
152  dw_should_terminate = false;
153  }
154  }
155  __syncthreads();
156  return dw_should_terminate;
157 }
__device__ int64_t dw_sm_cycle_start[128]
Definition: cuda_mapd_rt.cu:91
__device__ int64_t dw_cycle_budget
Definition: cuda_mapd_rt.cu:93
__inline__ __device__ uint32_t get_smid(void)
Definition: cuda_mapd_rt.cu:97
__device__ int32_t dw_abort
Definition: cuda_mapd_rt.cu:94

+ Here is the caller graph for this function:
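
The watchdog is driven by the __device__ globals listed in the Variables section (dw_cycle_budget, dw_abort, dw_sm_cycle_start). As a hedged sketch only, a host could arm or abort it by writing those symbols; how OmniSciDB actually configures them is not shown on this page, and the helper names below are illustrative.

#include <cuda_runtime.h>
#include <cstdint>

// Redeclared here only so the sketch stands alone; in the runtime these are the
// __device__ globals documented above.
__device__ int64_t dw_cycle_budget = 0;
__device__ int32_t dw_abort = 0;

void arm_watchdog_sketch(int64_t budget_cycles) {   // hypothetical helper
  cudaMemcpyToSymbol(dw_cycle_budget, &budget_cycles, sizeof(budget_cycles));
}

void request_abort_sketch() {                       // hypothetical helper
  const int32_t abort_flag = 1;
  cudaMemcpyToSymbol(dw_abort, &abort_flag, sizeof(abort_flag));
}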

__device__ void force_sync ( )

Definition at line 1360 of file cuda_mapd_rt.cu.

1360  {
1361  __threadfence_block();
1362 }
__device__ int64_t get_block_index ( )

Definition at line 23 of file cuda_mapd_rt.cu.

23  {
24  return blockIdx.x;
25 }
template<typename T = unsigned long long>
__device__ T get_empty_key ( )
inline

Definition at line 164 of file cuda_mapd_rt.cu.

References EMPTY_KEY_64.

164  {
165  return EMPTY_KEY_64;
166 }
#define EMPTY_KEY_64
template<>
__device__ unsigned int get_empty_key ( )
inline

Definition at line 169 of file cuda_mapd_rt.cu.

References EMPTY_KEY_32.

169  {
170  return EMPTY_KEY_32;
171 }
#define EMPTY_KEY_32
template<typename T >
__device__ int64_t* get_matching_group_value ( int64_t *  groups_buffer,
const uint32_t  h,
const T *  key,
const uint32_t  key_count,
const uint32_t  row_size_quad 
)
inline

Definition at line 174 of file cuda_mapd_rt.cu.

References align_to_int64(), and heavydb.dtypes::T.

Referenced by get_group_value(), get_group_value_with_watchdog(), and get_matching_group_value().

178  {
179  const T empty_key = get_empty_key<T>();
180  uint32_t off = h * row_size_quad;
181  auto row_ptr = reinterpret_cast<T*>(groups_buffer + off);
182  {
183  const T old = atomicCAS(row_ptr, empty_key, *key);
184  if (empty_key == old && key_count > 1) {
185  for (size_t i = 1; i <= key_count - 1; ++i) {
186  atomicExch(row_ptr + i, key[i]);
187  }
188  }
189  }
190  if (key_count > 1) {
191  while (atomicAdd(row_ptr + key_count - 1, 0) == empty_key) {
192  // spin until the winning thread has finished writing the entire key and the init
193  // value
194  }
195  }
196  bool match = true;
197  for (uint32_t i = 0; i < key_count; ++i) {
198  if (row_ptr[i] != key[i]) {
199  match = false;
200  break;
201  }
202  }
203 
204  if (match) {
205  auto row_ptr_i8 = reinterpret_cast<int8_t*>(row_ptr + key_count);
206  return reinterpret_cast<int64_t*>(align_to_int64(row_ptr_i8));
207  }
208  return NULL;
209 }
FORCE_INLINE HOST DEVICE T align_to_int64(T addr)

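The function claims or matches a single bucket; collision handling is left to the caller (get_group_value() and friends in GroupByRuntime.cpp). A hypothetical linear-probing caller, using MurmurHash3 purely for illustration, could look like the following sketch.

__device__ int64_t* hypothetical_probe(int64_t* groups_buffer,
                                       const uint32_t entry_count,
                                       const int64_t* key,
                                       const uint32_t key_count,
                                       const uint32_t row_size_quad) {
  uint32_t h = MurmurHash3(key, key_count * sizeof(int64_t), 0) % entry_count;
  for (uint32_t probe = 0; probe < entry_count; ++probe) {
    int64_t* row = get_matching_group_value(
        groups_buffer, h, key, key_count, sizeof(int64_t), row_size_quad);
    if (row) {
      return row;  // bucket h now holds (or already held) this key
    }
    h = (h + 1) % entry_count;  // occupied by a different key: probe the next bucket
  }
  return NULL;  // table full
}
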
__device__ int64_t* get_matching_group_value ( int64_t *  groups_buffer,
const uint32_t  h,
const int64_t *  key,
const uint32_t  key_count,
const uint32_t  key_width,
const uint32_t  row_size_quad 
)

Definition at line 211 of file cuda_mapd_rt.cu.

References get_matching_group_value().

216  {
217  switch (key_width) {
218  case 4:
219  return get_matching_group_value(groups_buffer,
220  h,
221  reinterpret_cast<const unsigned int*>(key),
222  key_count,
223  row_size_quad);
224  case 8:
225  return get_matching_group_value(groups_buffer,
226  h,
227  reinterpret_cast<const unsigned long long*>(key),
228  key_count,
229  row_size_quad);
230  default:
231  return NULL;
232  }
233 }
__device__ int64_t * get_matching_group_value(int64_t *groups_buffer, const uint32_t h, const T *key, const uint32_t key_count, const uint32_t row_size_quad)

__device__ int64_t* get_matching_group_value_columnar ( int64_t *  groups_buffer,
const uint32_t  h,
const int64_t *  key,
const uint32_t  key_qw_count,
const size_t  entry_count 
)

Definition at line 296 of file cuda_mapd_rt.cu.

References EMPTY_KEY_64.

Referenced by get_group_value_columnar(), and get_group_value_columnar_with_watchdog().

301  {
302  uint32_t off = h;
303  {
304  const uint64_t old = atomicCAS(
305  reinterpret_cast<unsigned long long*>(groups_buffer + off), EMPTY_KEY_64, *key);
306  if (EMPTY_KEY_64 == old) {
307  for (size_t i = 0; i < key_qw_count; ++i) {
308  groups_buffer[off] = key[i];
309  off += entry_count;
310  }
311  return &groups_buffer[off];
312  }
313  }
314  __syncthreads();
315  off = h;
316  for (size_t i = 0; i < key_qw_count; ++i) {
317  if (groups_buffer[off] != key[i]) {
318  return NULL;
319  }
320  off += entry_count;
321  }
322  return &groups_buffer[off];
323 }
#define EMPTY_KEY_64

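As the loops above show, the columnar buffer stores key component i of bucket h at groups_buffer[h + i * entry_count], with the returned pointer landing just past the last key column. A tiny, hypothetical addressing helper makes the layout explicit:

__device__ inline int64_t* columnar_key_component(int64_t* groups_buffer,
                                                  const uint32_t h,
                                                  const uint32_t i,
                                                  const size_t entry_count) {
  // column-major addressing: each key component occupies its own column of
  // length entry_count
  return groups_buffer + h + i * entry_count;
}
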
template<typename T >
__device__ int32_t get_matching_group_value_columnar_slot ( int64_t *  groups_buffer,
const uint32_t  entry_count,
const uint32_t  h,
const T *  key,
const uint32_t  key_count 
)

Definition at line 236 of file cuda_mapd_rt.cu.

References heavydb.dtypes::T.

Referenced by get_group_value_columnar_slot(), get_group_value_columnar_slot_with_watchdog(), and get_matching_group_value_columnar_slot().

240  {
241  const T empty_key = get_empty_key<T>();
242  const uint64_t old =
243  atomicCAS(reinterpret_cast<T*>(groups_buffer + h), empty_key, *key);
244  // the winner thread proceeds with writing the rest of the keys
245  if (old == empty_key) {
246  uint32_t offset = h + entry_count;
247  for (size_t i = 1; i < key_count; ++i) {
248  *reinterpret_cast<T*>(groups_buffer + offset) = key[i];
249  offset += entry_count;
250  }
251  }
252 
253  __threadfence();
254  // for all threads except the winning thread, the memory contents of the keys
255  // related to the hash offset are checked again. In case of a complete match
256  // the hash offset is returned, otherwise -1 is returned
257  if (old != empty_key) {
258  uint32_t offset = h;
259  for (uint32_t i = 0; i < key_count; ++i) {
260  if (*reinterpret_cast<T*>(groups_buffer + offset) != key[i]) {
261  return -1;
262  }
263  offset += entry_count;
264  }
265  }
266  return h;
267 }

__device__ int32_t get_matching_group_value_columnar_slot ( int64_t *  groups_buffer,
const uint32_t  entry_count,
const uint32_t  h,
const int64_t *  key,
const uint32_t  key_count,
const uint32_t  key_width 
)

Definition at line 270 of file cuda_mapd_rt.cu.

References get_matching_group_value_columnar_slot().

275  {
276  switch (key_width) {
277  case 4:
278  return get_matching_group_value_columnar_slot(
279  groups_buffer,
280  entry_count,
281  h,
282  reinterpret_cast<const unsigned int*>(key),
283  key_count);
284  case 8:
285  return get_matching_group_value_columnar_slot(
286  groups_buffer,
287  entry_count,
288  h,
289  reinterpret_cast<const unsigned long long*>(key),
290  key_count);
291  default:
292  return -1;
293  }
294 }
__device__ int32_t get_matching_group_value_columnar_slot(int64_t *groups_buffer, const uint32_t entry_count, const uint32_t h, const T *key, const uint32_t key_count)

__inline__ __device__ uint32_t get_smid ( void  )

Definition at line 97 of file cuda_mapd_rt.cu.

Referenced by dynamic_watchdog().

97  {
98  uint32_t ret;
99  asm("mov.u32 %0, %%smid;" : "=r"(ret));
100  return ret;
101 }

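The inline PTX reads the %smid special register, i.e. the index of the multiprocessor the block is currently resident on. This is how dynamic_watchdog() indexes the 128-entry dw_sm_cycle_start array; a hypothetical per-SM counter would follow the same pattern:

__device__ unsigned long long hypothetical_per_sm_counter[128];

__device__ void hypothetical_bump_sm_counter() {
  const uint32_t smid = get_smid();
  if (smid < 128) {  // guard against devices with more SMs than the array holds
    atomicAdd(&hypothetical_per_sm_counter[smid], 1ULL);
  }
}
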
__device__ int64_t get_thread_index ( )

Definition at line 19 of file cuda_mapd_rt.cu.

19  {
20  return threadIdx.x;
21 }
__device__ int32_t group_buff_idx_impl ( )

Definition at line 31 of file cuda_mapd_rt.cu.

References pos_start_impl().

31  {
32  return pos_start_impl(NULL);
33 }
__device__ int32_t pos_start_impl(const int32_t *row_index_resume)
Definition: cuda_mapd_rt.cu:27

__device__ const int64_t* init_shared_mem ( const int64_t *  global_groups_buffer,
const int32_t  groups_buffer_size 
)

Initializes the shared memory buffer for perfect hash group by. This simply copies the global group-by buffer (already initialized on the host and transferred to the device) into each thread block's shared-memory group-by buffer.

Definition at line 66 of file cuda_mapd_rt.cu.

67  {
68  // dynamic shared memory declaration
69  extern __shared__ int64_t shared_groups_buffer[];
70 
71  // the buffer size is assumed to be a multiple of 64-bit units,
72  // so each thread can safely copy one 64-bit element at a time
73  const int32_t buffer_units = groups_buffer_size >> 3;
74 
75  for (int32_t pos = threadIdx.x; pos < buffer_units; pos += blockDim.x) {
76  shared_groups_buffer[pos] = global_groups_buffer[pos];
77  }
78  __syncthreads();
79  return shared_groups_buffer;
80 }
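
A usage sketch under the same assumptions the code makes (groups_buffer_size is a byte count that is a multiple of 8, and the kernel is launched with at least that much dynamic shared memory); the kernel below is hypothetical.

__global__ void hypothetical_groupby_kernel(const int64_t* global_groups_buffer,
                                            const int32_t groups_buffer_size) {
  const int64_t* groups = init_shared_mem(global_groups_buffer, groups_buffer_size);
  // ... aggregate into the per-block shared-memory copy pointed to by groups,
  //     then reduce it back into the global buffer ...
}

// Launch with a matching dynamic shared-memory size (third launch argument):
//   hypothetical_groupby_kernel<<<grid, block, buf_bytes>>>(buf, buf_bytes);
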
__device__ const int64_t* init_shared_mem_nop ( const int64_t *  groups_buffer,
const int32_t  groups_buffer_size 
)

Definition at line 43 of file cuda_mapd_rt.cu.

45  {
46  return groups_buffer;
47 }
__device__ void linear_probabilistic_count ( uint8_t *  bitmap,
const uint32_t  bitmap_bytes,
const uint8_t *  key_bytes,
const uint32_t  key_len 
)

Definition at line 1293 of file cuda_mapd_rt.cu.

1296  {
1297  const uint32_t bit_pos = MurmurHash3(key_bytes, key_len, 0) % (bitmap_bytes * 8);
1298  const uint32_t word_idx = bit_pos / 32;
1299  const uint32_t bit_idx = bit_pos % 32;
1300  atomicOr(((uint32_t*)bitmap) + word_idx, 1 << bit_idx);
1301 }
RUNTIME_EXPORT NEVER_INLINE DEVICE uint32_t MurmurHash3(const void *key, int len, const uint32_t seed)
Definition: MurmurHash.cpp:33
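
Each row hashes its key with MurmurHash3 and sets one bit of the bitmap. Turning the bitmap back into a cardinality happens elsewhere; as a host-side sketch only (not the engine's actual reduction code, and using the GCC/Clang popcount builtin), the standard linear-counting estimator would be n_hat = -m * ln(V), where m is the bitmap size in bits and V is the fraction of bits still zero.

#include <cmath>
#include <cstdint>

double estimate_distinct_from_bitmap(const uint8_t* bitmap, const uint32_t bitmap_bytes) {
  const double m = bitmap_bytes * 8.0;
  uint64_t set_bits = 0;
  for (uint32_t i = 0; i < bitmap_bytes; ++i) {
    set_bits += __builtin_popcount(bitmap[i]);
  }
  const double zero_fraction = (m - set_bits) / m;
  return zero_fraction > 0.0 ? -m * std::log(zero_fraction) : m;  // saturate when all bits are set
}
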
__device__ int32_t pos_start_impl ( const int32_t *  row_index_resume)

Definition at line 27 of file cuda_mapd_rt.cu.

Referenced by get_bin_from_k_heap_impl(), get_error_code(), group_buff_idx_impl(), and record_error_code().

27  {
28  return blockIdx.x * blockDim.x + threadIdx.x;
29 }

__device__ int32_t pos_step_impl ( )

Definition at line 35 of file cuda_mapd_rt.cu.

Referenced by get_bin_from_k_heap_impl().

35  {
36  return blockDim.x * gridDim.x;
37 }

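Together with pos_start_impl() above, this yields the usual grid-stride iteration pattern; the row-processing body below is hypothetical.

__device__ void hypothetical_scan_rows(const int64_t* rows, const int32_t row_count) {
  const int32_t start = pos_start_impl(NULL);  // global thread index
  const int32_t step = pos_step_impl();        // total number of threads in the grid
  for (int32_t i = start; i < row_count; i += step) {
    // ... process rows[i] ...
  }
}
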
__device__ bool slotEmptyKeyCAS ( int64_t *  slot,
int64_t  new_val,
int64_t  init_val 
)

Definition at line 1193 of file cuda_mapd_rt.cu.

1195  {
1196  auto slot_address = reinterpret_cast<unsigned long long int*>(slot);
1197  const auto empty_key =
1198  static_cast<unsigned long long int*>(static_cast<void*>(&init_val));
1199  const auto new_val_cast =
1200  static_cast<unsigned long long int*>(static_cast<void*>(&new_val));
1201 
1202  const auto old_val = atomicCAS(slot_address, *empty_key, *new_val_cast);
1203  if (old_val == *empty_key) {
1204  return true;
1205  } else {
1206  return false;
1207  }
1208 }
__device__ bool slotEmptyKeyCAS_int16 ( int16_t *  slot,
int16_t  new_val,
int16_t  init_val 
)

Definition at line 1221 of file cuda_mapd_rt.cu.

1223  {
1224  unsigned int* base_slot_address =
1225  reinterpret_cast<unsigned int*>(reinterpret_cast<size_t>(slot) & ~0x3);
1226  unsigned int old_value = *base_slot_address;
1227  unsigned int swap_value, compare_value;
1228  do {
1229  compare_value = old_value;
1230  // exit criteria: if init_val does not exist in the slot (some other thread has
1231  // succeeded)
1232  if (static_cast<unsigned int>(init_val) !=
1233  __byte_perm(
1234  compare_value, 0, (reinterpret_cast<size_t>(slot) & 0x2 ? 0x3244 : 0x4410))) {
1235  return false;
1236  }
1237  swap_value = __byte_perm(compare_value,
1238  static_cast<unsigned int>(new_val),
1239  (reinterpret_cast<size_t>(slot) & 0x2) ? 0x5410 : 0x3254);
1240  old_value = atomicCAS(base_slot_address, compare_value, swap_value);
1241  } while (compare_value != old_value);
1242  return true;
1243 }
__device__ bool slotEmptyKeyCAS_int32 ( int32_t *  slot,
int32_t  new_val,
int32_t  init_val 
)

Definition at line 1210 of file cuda_mapd_rt.cu.

1212  {
1213  unsigned int* slot_address = reinterpret_cast<unsigned int*>(slot);
1214  unsigned int compare_value = static_cast<unsigned int>(init_val);
1215  unsigned int swap_value = static_cast<unsigned int>(new_val);
1216 
1217  const unsigned int old_value = atomicCAS(slot_address, compare_value, swap_value);
1218  return old_value == compare_value;
1219 }
__device__ bool slotEmptyKeyCAS_int8 ( int8_t *  slot,
int8_t  new_val,
int8_t  init_val 
)

Definition at line 1245 of file cuda_mapd_rt.cu.

1247  {
1248  // properly align the slot address:
1249  unsigned int* base_slot_address =
1250  reinterpret_cast<unsigned int*>(reinterpret_cast<size_t>(slot) & ~0x3);
1251  constexpr unsigned int byte_permutations[] = {0x3214, 0x3240, 0x3410, 0x4210};
1252  unsigned int old_value = *base_slot_address;
1253  unsigned int swap_value, compare_value;
1254  do {
1255  compare_value = old_value;
1256  // exit criteria: if init_val does not exist in the slot (some other thread has
1257  // succeeded)
1258  if (static_cast<unsigned int>(init_val) !=
1259  __byte_perm(compare_value, 0, (reinterpret_cast<size_t>(slot) & 0x3) | 0x4440)) {
1260  return false;
1261  }
1262  swap_value = __byte_perm(compare_value,
1263  static_cast<unsigned int>(new_val),
1264  byte_permutations[reinterpret_cast<size_t>(slot) & 0x3]);
1265  old_value = atomicCAS(base_slot_address, compare_value, swap_value);
1266  } while (compare_value != old_value);
1267  return true;
1268 }
__device__ StringView string_decode ( int8_t *  chunk_iter_,
int64_t  pos 
)

Definition at line 1282 of file cuda_mapd_rt.cu.

References ChunkIter_get_nth(), VarlenDatum::is_null, VarlenDatum::length, and VarlenDatum::pointer.

1282  {
1283  // TODO(alex): de-dup, the x64 version is basically identical
1284  auto chunk_iter = reinterpret_cast<ChunkIter*>(chunk_iter_);
1285  VarlenDatum vd;
1286  bool is_end;
1287  ChunkIter_get_nth(chunk_iter, pos, false, &vd, &is_end);
1288  // CHECK(!is_end); <--- this is the difference (re: above comment)
1289  return vd.is_null ? StringView{nullptr, 0u}
1290  : StringView{reinterpret_cast<char const*>(vd.pointer), vd.length};
1291 }
bool is_null
Definition: Datum.h:57
DEVICE void ChunkIter_get_nth(ChunkIter *it, int n, bool uncompress, VarlenDatum *result, bool *is_end)
Definition: ChunkIter.cpp:182
int8_t * pointer
Definition: Datum.h:56
size_t length
Definition: Datum.h:55

__device__ void sync_threadblock ( )

Definition at line 1383 of file cuda_mapd_rt.cu.

Referenced by GpuSharedMemCodeBuilder::codegenInitialization(), and GpuSharedMemCodeBuilder::codegenReduction().

1383  {
1384  __syncthreads();
1385 }

__device__ void sync_warp ( )

Definition at line 1364 of file cuda_mapd_rt.cu.

1364  {
1365  __syncwarp();
1366 }
__device__ void sync_warp_protected ( int64_t  thread_pos,
int64_t  row_count 
)

Protected warp synchronization to make sure that either all or none of the threads within a warp go through a synchronization barrier.

thread_pos: the current thread position to be used for a memory access
row_count: maximum number of rows to be processed

The function performs a warp sync iff all 32 threads within that warp will process valid data. NOTE: it currently assumes that the warp size is 32.

Definition at line 1375 of file cuda_mapd_rt.cu.

1375  {
1376  // only syncing if NOT within the same warp as those threads experiencing the critical
1377  // edge
1378  if ((((row_count - 1) | 0x1F) - thread_pos) >= 32) {
1379  __syncwarp();
1380  }
1381 }
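
A usage sketch (the loop body is hypothetical): in the warp that straddles the last valid row, some lanes fall outside row_count, so an unconditional __syncwarp() could be reached by only part of the warp; sync_warp_protected() simply skips the sync for that boundary warp.

__device__ void hypothetical_warp_loop(int64_t* vals, const int64_t row_count) {
  for (int64_t pos = pos_start_impl(NULL); pos < row_count; pos += pos_step_impl()) {
    // ... cooperative work on vals[pos] that wants the warp in lockstep ...
    sync_warp_protected(pos, row_count);
  }
}
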
__device__ int8_t thread_warp_idx ( const int8_t  warp_sz)

Definition at line 39 of file cuda_mapd_rt.cu.

39  {
40  return threadIdx.x % warp_sz;
41 }
__device__ void write_back_non_grouped_agg ( int64_t *  input_buffer,
int64_t *  output_buffer,
const int32_t  agg_idx 
)

Definition at line 1395 of file cuda_mapd_rt.cu.

References agg_sum_shared().

1397  {
1398  if (threadIdx.x == agg_idx) {
1399  agg_sum_shared(output_buffer, input_buffer[agg_idx]);
1400  }
1401 }
__device__ int64_t agg_sum_shared(int64_t *agg, const int64_t val)

__device__ void write_back_nop ( int64_t *  dest,
int64_t *  src,
const int32_t  sz 
)

Definition at line 49 of file cuda_mapd_rt.cu.

49  {
50 }

Variable Documentation

__device__ int32_t dw_abort = 0

Definition at line 94 of file cuda_mapd_rt.cu.

Referenced by dynamic_watchdog(), dynamic_watchdog_init(), and Executor::interrupt().

__device__ int64_t dw_cycle_budget = 0

Definition at line 93 of file cuda_mapd_rt.cu.

Referenced by dynamic_watchdog(), and dynamic_watchdog_init().

__device__ int64_t dw_sm_cycle_start[128]

Definition at line 91 of file cuda_mapd_rt.cu.

Referenced by dynamic_watchdog().

__device__ int32_t runtime_interrupt_flag = 0

Definition at line 95 of file cuda_mapd_rt.cu.

Referenced by check_interrupt(), check_interrupt_init(), and Executor::interrupt().