OmniSciDB  addbbd5075
cuda_mapd_rt.cu File Reference
#include <cuda.h>
#include <float.h>
#include <stdint.h>
#include <limits>
#include "BufferCompaction.h"
#include "ExtensionFunctions.hpp"
#include "GpuRtConstants.h"
#include "HyperLogLogRank.h"
#include "TableFunctions/TableFunctions.hpp"
#include "GpuInitGroups.cu"
#include "GroupByRuntime.cpp"
#include "JoinHashTableQueryRuntime.cpp"
#include "MurmurHash.cpp"
#include "TopKRuntime.cpp"
#include <stdio.h>
#include "../Utils/ChunkIter.cpp"
#include "DateTruncate.cpp"
#include "ExtractFromTime.cpp"
#include "ArrayOps.cpp"
#include "DateAdd.cpp"
#include "StringFunctions.cpp"
#include "../Utils/Regexp.cpp"
#include "../Utils/StringLike.cpp"

Macros

#define init_group_by_buffer_gpu_impl   init_group_by_buffer_gpu
 
#define DEF_AGG_ID_INT_SHARED(n)
 
#define DEF_SKIP_AGG(base_agg_func)
 
#define DATA_T   int64_t
 
#define ADDR_T   uint64_t
 
#define DATA_T   int32_t
 
#define ADDR_T   uint32_t
 
#define DEF_SKIP_AGG(base_agg_func)
 
#define DATA_T   double
 
#define ADDR_T   uint64_t
 
#define DATA_T   float
 
#define ADDR_T   uint32_t
 
#define EXECUTE_INCLUDE
 

Functions

__device__ int32_t pos_start_impl (const int32_t *row_index_resume)
 
__device__ int32_t group_buff_idx_impl ()
 
__device__ int32_t pos_step_impl ()
 
__device__ int8_t thread_warp_idx (const int8_t warp_sz)
 
__device__ const int64_t * init_shared_mem_nop (const int64_t *groups_buffer, const int32_t groups_buffer_size)
 
__device__ void write_back_nop (int64_t *dest, int64_t *src, const int32_t sz)
 
__device__ const int64_t * init_shared_mem (const int64_t *groups_buffer, const int32_t groups_buffer_size)
 
__device__ int64_t * alloc_shared_mem_dynamic ()
 
__device__ void set_shared_mem_to_identity (int64_t *groups_buffer_smem, const int32_t groups_buffer_size, const int64_t identity_element=0)
 
__device__ const int64_t * init_shared_mem_dynamic (const int64_t *groups_buffer, const int32_t groups_buffer_size)
 
__device__ void write_back (int64_t *dest, int64_t *src, const int32_t sz)
 
__device__ void write_back_smem_nop (int64_t *dest, int64_t *src, const int32_t sz)
 
__device__ void agg_from_smem_to_gmem_nop (int64_t *gmem_dest, int64_t *smem_src, const int32_t num_elements)
 
__device__ void agg_from_smem_to_gmem_binId_count (int64_t *gmem_dest, int64_t *smem_src, const int32_t num_elements)
 
__device__ void agg_from_smem_to_gmem_count_binId (int64_t *gmem_dest, int64_t *smem_src, const int32_t num_elements)
 
__inline__ __device__ uint32_t get_smid (void)
 
__device__ bool dynamic_watchdog ()
 
template<typename T = unsigned long long>
__device__ T get_empty_key ()
 
template<>
__device__ unsigned int get_empty_key ()
 
template<typename T >
__device__ int64_t * get_matching_group_value (int64_t *groups_buffer, const uint32_t h, const T *key, const uint32_t key_count, const uint32_t row_size_quad)
 
__device__ int64_t * get_matching_group_value (int64_t *groups_buffer, const uint32_t h, const int64_t *key, const uint32_t key_count, const uint32_t key_width, const uint32_t row_size_quad, const int64_t *init_vals)
 
template<typename T >
__device__ int32_t get_matching_group_value_columnar_slot (int64_t *groups_buffer, const uint32_t entry_count, const uint32_t h, const T *key, const uint32_t key_count)
 
__device__ int32_t get_matching_group_value_columnar_slot (int64_t *groups_buffer, const uint32_t entry_count, const uint32_t h, const int64_t *key, const uint32_t key_count, const uint32_t key_width)
 
__device__ int64_t * get_matching_group_value_columnar (int64_t *groups_buffer, const uint32_t h, const int64_t *key, const uint32_t key_qw_count, const size_t entry_count)
 
__device__ int64_t atomicMax64 (int64_t *address, int64_t val)
 
__device__ int64_t atomicMin64 (int64_t *address, int64_t val)
 
__device__ double atomicAdd (double *address, double val)
 
__device__ double atomicMax (double *address, double val)
 
__device__ float atomicMax (float *address, float val)
 
__device__ double atomicMin (double *address, double val)
 
__device__ double atomicMin (float *address, float val)
 
__device__ uint64_t agg_count_shared (uint64_t *agg, const int64_t val)
 
__device__ uint32_t agg_count_int32_shared (uint32_t *agg, const int32_t val)
 
__device__ uint64_t agg_count_double_shared (uint64_t *agg, const double val)
 
__device__ uint32_t agg_count_float_shared (uint32_t *agg, const float val)
 
__device__ int64_t agg_sum_shared (int64_t *agg, const int64_t val)
 
__device__ int32_t agg_sum_int32_shared (int32_t *agg, const int32_t val)
 
__device__ void agg_sum_float_shared (int32_t *agg, const float val)
 
__device__ void agg_sum_double_shared (int64_t *agg, const double val)
 
__device__ void agg_max_shared (int64_t *agg, const int64_t val)
 
__device__ void agg_max_int32_shared (int32_t *agg, const int32_t val)
 
__device__ void agg_max_double_shared (int64_t *agg, const double val)
 
__device__ void agg_max_float_shared (int32_t *agg, const float val)
 
__device__ void agg_min_shared (int64_t *agg, const int64_t val)
 
__device__ void agg_min_int32_shared (int32_t *agg, const int32_t val)
 
__device__ void atomicMax16 (int16_t *agg, const int16_t val)
 
__device__ void atomicMax8 (int8_t *agg, const int8_t val)
 
__device__ void atomicMin16 (int16_t *agg, const int16_t val)
 
__device__ void atomicMin16SkipVal (int16_t *agg, const int16_t val, const int16_t skip_val)
 
__device__ void atomicMin8 (int8_t *agg, const int8_t val)
 
__device__ void atomicMin8SkipVal (int8_t *agg, const int8_t val, const int8_t skip_val)
 
__device__ void agg_max_int16_shared (int16_t *agg, const int16_t val)
 
__device__ void agg_max_int8_shared (int8_t *agg, const int8_t val)
 
__device__ void agg_min_int16_shared (int16_t *agg, const int16_t val)
 
__device__ void agg_min_int8_shared (int8_t *agg, const int8_t val)
 
__device__ void agg_min_double_shared (int64_t *agg, const double val)
 
__device__ void agg_min_float_shared (int32_t *agg, const float val)
 
__device__ void agg_id_shared (int64_t *agg, const int64_t val)
 
__device__ int32_t checked_single_agg_id_shared (int64_t *agg, const int64_t val, const int64_t null_val)
 
__device__ void agg_id_double_shared (int64_t *agg, const double val)
 
__device__ int32_t checked_single_agg_id_double_shared (int64_t *agg, const double val, const double null_val)
 
__device__ void agg_id_double_shared_slow (int64_t *agg, const double *val)
 
__device__ int32_t checked_single_agg_id_double_shared_slow (int64_t *agg, const double *valp, const double null_val)
 
__device__ void agg_id_float_shared (int32_t *agg, const float val)
 
__device__ int32_t checked_single_agg_id_float_shared (int32_t *agg, const float val, const float null_val)
 
__device__ void agg_max_int32_skip_val_shared (int32_t *agg, const int32_t val, const int32_t skip_val)
 
__device__ void agg_max_int16_skip_val_shared (int16_t *agg, const int16_t val, const int16_t skip_val)
 
__device__ void agg_min_int16_skip_val_shared (int16_t *agg, const int16_t val, const int16_t skip_val)
 
__device__ void agg_max_int8_skip_val_shared (int8_t *agg, const int8_t val, const int8_t skip_val)
 
__device__ void agg_min_int8_skip_val_shared (int8_t *agg, const int8_t val, const int8_t skip_val)
 
__device__ int32_t atomicMin32SkipVal (int32_t *address, int32_t val, const int32_t skip_val)
 
__device__ void agg_min_int32_skip_val_shared (int32_t *agg, const int32_t val, const int32_t skip_val)
 
__device__ int32_t atomicSum32SkipVal (int32_t *address, const int32_t val, const int32_t skip_val)
 
__device__ int32_t agg_sum_int32_skip_val_shared (int32_t *agg, const int32_t val, const int32_t skip_val)
 
__device__ int64_t atomicSum64SkipVal (int64_t *address, const int64_t val, const int64_t skip_val)
 
__device__ int64_t agg_sum_skip_val_shared (int64_t *agg, const int64_t val, const int64_t skip_val)
 
__device__ int64_t atomicMin64SkipVal (int64_t *address, int64_t val, const int64_t skip_val)
 
__device__ void agg_min_skip_val_shared (int64_t *agg, const int64_t val, const int64_t skip_val)
 
__device__ int64_t atomicMax64SkipVal (int64_t *address, int64_t val, const int64_t skip_val)
 
__device__ void agg_max_skip_val_shared (int64_t *agg, const int64_t val, const int64_t skip_val)
 
__device__ void agg_max_float_skip_val_shared (int32_t *agg, const float val, const float skip_val)
 
__device__ float atomicMinFltSkipVal (int32_t *address, float val, const float skip_val)
 
__device__ void agg_min_float_skip_val_shared (int32_t *agg, const float val, const float skip_val)
 
__device__ void atomicSumFltSkipVal (float *address, const float val, const float skip_val)
 
__device__ void agg_sum_float_skip_val_shared (int32_t *agg, const float val, const float skip_val)
 
__device__ void atomicSumDblSkipVal (double *address, const double val, const double skip_val)
 
__device__ void agg_sum_double_skip_val_shared (int64_t *agg, const double val, const double skip_val)
 
__device__ double atomicMinDblSkipVal (double *address, double val, const double skip_val)
 
__device__ void agg_min_double_skip_val_shared (int64_t *agg, const double val, const double skip_val)
 
__device__ double atomicMaxDblSkipVal (double *address, double val, const double skip_val)
 
__device__ void agg_max_double_skip_val_shared (int64_t *agg, const double val, const double skip_val)
 
__device__ bool slotEmptyKeyCAS (int64_t *slot, int64_t new_val, int64_t init_val)
 
__device__ bool slotEmptyKeyCAS_int32 (int32_t *slot, int32_t new_val, int32_t init_val)
 
__device__ bool slotEmptyKeyCAS_int16 (int16_t *slot, int16_t new_val, int16_t init_val)
 
__device__ bool slotEmptyKeyCAS_int8 (int8_t *slot, int8_t new_val, int8_t init_val)
 
__device__ uint64_t string_decode (int8_t *chunk_iter_, int64_t pos)
 
__device__ void linear_probabilistic_count (uint8_t *bitmap, const uint32_t bitmap_bytes, const uint8_t *key_bytes, const uint32_t key_len)
 
__device__ void agg_count_distinct_bitmap_gpu (int64_t *agg, const int64_t val, const int64_t min_val, const int64_t base_dev_addr, const int64_t base_host_addr, const uint64_t sub_bitmap_count, const uint64_t bitmap_bytes)
 
__device__ void agg_count_distinct_bitmap_skip_val_gpu (int64_t *agg, const int64_t val, const int64_t min_val, const int64_t skip_val, const int64_t base_dev_addr, const int64_t base_host_addr, const uint64_t sub_bitmap_count, const uint64_t bitmap_bytes)
 
__device__ void agg_approximate_count_distinct_gpu (int64_t *agg, const int64_t key, const uint32_t b, const int64_t base_dev_addr, const int64_t base_host_addr)
 
__device__ void force_sync ()
 
__device__ void sync_warp ()
 
__device__ void sync_warp_protected (int64_t thread_pos, int64_t row_count)
 

Variables

__device__ int64_t dw_sm_cycle_start [128]
 
__device__ int64_t dw_cycle_budget = 0
 
__device__ int32_t dw_abort = 0
 

Macro Definition Documentation

#define ADDR_T   uint64_t

Definition at line 1033 of file cuda_mapd_rt.cu.

#define ADDR_T   uint32_t

Definition at line 1033 of file cuda_mapd_rt.cu.

#define ADDR_T   uint64_t

Definition at line 1033 of file cuda_mapd_rt.cu.

#define ADDR_T   uint32_t

Definition at line 1033 of file cuda_mapd_rt.cu.

#define DATA_T   int64_t

Definition at line 1032 of file cuda_mapd_rt.cu.

#define DATA_T   int32_t

Definition at line 1032 of file cuda_mapd_rt.cu.

#define DATA_T   double

Definition at line 1032 of file cuda_mapd_rt.cu.

#define DATA_T   float

Definition at line 1032 of file cuda_mapd_rt.cu.

#define DEF_AGG_ID_INT_SHARED (   n)
Value:
extern "C" __device__ void agg_id_int##n##_shared(int##n##_t* agg, \
const int##n##_t val) { \
*agg = val; \
}

Definition at line 745 of file cuda_mapd_rt.cu.
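Expanding the macro is mechanical; for example (assuming an instantiation for 32-bit values), DEF_AGG_ID_INT_SHARED(32) yields:

extern "C" __device__ void agg_id_int32_shared(int32_t* agg, const int32_t val) {
  *agg = val;
}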

#define DEF_SKIP_AGG (   base_agg_func)
Value:
extern "C" __device__ ADDR_T base_agg_func##_skip_val_shared( \
ADDR_T* agg, const DATA_T val, const DATA_T skip_val) { \
if (val != skip_val) { \
return base_agg_func##_shared(agg, val); \
} \
return 0; \
}
#define DATA_T
#define ADDR_T

Definition at line 1017 of file cuda_mapd_rt.cu.

#define DEF_SKIP_AGG (   base_agg_func)
Value:
extern "C" __device__ ADDR_T base_agg_func##_skip_val_shared( \
ADDR_T* agg, const DATA_T val, const DATA_T skip_val) { \
if (val != skip_val) { \
return base_agg_func##_shared(agg, val); \
} \
return *agg; \
}
#define DATA_T
#define ADDR_T

Definition at line 1017 of file cuda_mapd_rt.cu.
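The two DEF_SKIP_AGG variants differ only in what they return when val equals skip_val: the first returns 0, the second returns the current contents of *agg. As a hedged illustration (the exact instantiations are not shown on this page), with DATA_T defined as int32_t, ADDR_T as uint32_t, and agg_count_int32 as the base aggregate, the first variant would expand to:

extern "C" __device__ uint32_t agg_count_int32_skip_val_shared(
    uint32_t* agg, const int32_t val, const int32_t skip_val) {
  if (val != skip_val) {
    return agg_count_int32_shared(agg, val);
  }
  return 0;
}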

#define EXECUTE_INCLUDE

Definition at line 1237 of file cuda_mapd_rt.cu.

#define init_group_by_buffer_gpu_impl   init_group_by_buffer_gpu

Definition at line 149 of file cuda_mapd_rt.cu.

Function Documentation

__device__ void agg_approximate_count_distinct_gpu ( int64_t *  agg,
const int64_t  key,
const uint32_t  b,
const int64_t  base_dev_addr,
const int64_t  base_host_addr 
)

Definition at line 1313 of file cuda_mapd_rt.cu.

References atomicMax(), get_rank(), and MurmurHash64A().

1318  {
1319  const uint64_t hash = MurmurHash64A(&key, sizeof(key), 0);
1320  const uint32_t index = hash >> (64 - b);
1321  const int32_t rank = get_rank(hash << b, 64 - b);
1322  const int64_t host_addr = *agg;
1323  int32_t* M = (int32_t*)(base_dev_addr + host_addr - base_host_addr);
1324  atomicMax(&M[index], rank);
1325 }
FORCE_INLINE uint8_t get_rank(uint64_t x, uint32_t b)
NEVER_INLINE DEVICE uint64_t MurmurHash64A(const void *key, int len, uint64_t seed)
Definition: MurmurHash.cpp:26
__device__ double atomicMax(double *address, double val)
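Read as a HyperLogLog update (a hedged summary of the code above): the top b bits of the 64-bit MurmurHash select one of 2^b registers, get_rank() computes the rank (position of the leftmost set bit) of the remaining 64 - b bits, and each register keeps the maximum rank observed, updated atomically via atomicMax().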

__device__ void agg_count_distinct_bitmap_gpu ( int64_t *  agg,
const int64_t  val,
const int64_t  min_val,
const int64_t  base_dev_addr,
const int64_t  base_host_addr,
const uint64_t  sub_bitmap_count,
const uint64_t  bitmap_bytes 
)

Definition at line 1266 of file cuda_mapd_rt.cu.

Referenced by agg_count_distinct_bitmap_skip_val_gpu().

1272  {
1273  const uint64_t bitmap_idx = val - min_val;
1274  const uint32_t byte_idx = bitmap_idx >> 3;
1275  const uint32_t word_idx = byte_idx >> 2;
1276  const uint32_t byte_word_idx = byte_idx & 3;
1277  const int64_t host_addr = *agg;
1278  uint32_t* bitmap = (uint32_t*)(base_dev_addr + host_addr - base_host_addr +
1279  (threadIdx.x & (sub_bitmap_count - 1)) * bitmap_bytes);
1280  switch (byte_word_idx) {
1281  case 0:
1282  atomicOr(&bitmap[word_idx], 1 << (bitmap_idx & 7));
1283  break;
1284  case 1:
1285  atomicOr(&bitmap[word_idx], 1 << ((bitmap_idx & 7) + 8));
1286  break;
1287  case 2:
1288  atomicOr(&bitmap[word_idx], 1 << ((bitmap_idx & 7) + 16));
1289  break;
1290  case 3:
1291  atomicOr(&bitmap[word_idx], 1 << ((bitmap_idx & 7) + 24));
1292  break;
1293  default:
1294  break;
1295  }
1296 }
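A worked example (not from the source): with sub_bitmap_count = 1 and val - min_val = 45, byte_idx = 45 >> 3 = 5, word_idx = 5 >> 2 = 1, and byte_word_idx = 5 & 3 = 1, so the case 1 branch sets bit (45 & 7) + 8 = 13 of 32-bit word 1, i.e. bit 45 of the bitmap. The (threadIdx.x & (sub_bitmap_count - 1)) * bitmap_bytes term spreads threads across sub_bitmap_count copies of the bitmap to reduce atomic contention; with sub_bitmap_count = 1 all threads share a single bitmap.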

__device__ void agg_count_distinct_bitmap_skip_val_gpu ( int64_t *  agg,
const int64_t  val,
const int64_t  min_val,
const int64_t  skip_val,
const int64_t  base_dev_addr,
const int64_t  base_host_addr,
const uint64_t  sub_bitmap_count,
const uint64_t  bitmap_bytes 
)

Definition at line 1298 of file cuda_mapd_rt.cu.

References agg_count_distinct_bitmap_gpu().

1306  {
1307  if (val != skip_val) {
1308  agg_count_distinct_bitmap_gpu(
1309  agg, val, min_val, base_dev_addr, base_host_addr, sub_bitmap_count, bitmap_bytes);
1310  }
1311 }
__device__ void agg_count_distinct_bitmap_gpu(int64_t *agg, const int64_t val, const int64_t min_val, const int64_t base_dev_addr, const int64_t base_host_addr, const uint64_t sub_bitmap_count, const uint64_t bitmap_bytes)

__device__ uint64_t agg_count_double_shared ( uint64_t *  agg,
const double  val 
)

Definition at line 503 of file cuda_mapd_rt.cu.

References agg_count_shared().

503  {
504  return agg_count_shared(agg, val);
505 }
__device__ uint64_t agg_count_shared(uint64_t *agg, const int64_t val)

__device__ uint32_t agg_count_float_shared ( uint32_t *  agg,
const float  val 
)

Definition at line 507 of file cuda_mapd_rt.cu.

References agg_count_int32_shared().

507  {
508  return agg_count_int32_shared(agg, val);
509 }
__device__ uint32_t agg_count_int32_shared(uint32_t *agg, const int32_t val)

__device__ uint32_t agg_count_int32_shared ( uint32_t *  agg,
const int32_t  val 
)

Definition at line 499 of file cuda_mapd_rt.cu.

References atomicAdd().

Referenced by agg_count_float_shared().

499  {
500  return atomicAdd(agg, 1UL);
501 }
__device__ double atomicAdd(double *address, double val)

__device__ uint64_t agg_count_shared ( uint64_t *  agg,
const int64_t  val 
)

Definition at line 495 of file cuda_mapd_rt.cu.

References atomicAdd().

Referenced by agg_count_double_shared().

495  {
496  return static_cast<uint64_t>(atomicAdd(reinterpret_cast<uint32_t*>(agg), 1UL));
497 }
__device__ double atomicAdd(double *address, double val)

__device__ void agg_from_smem_to_gmem_binId_count ( int64_t *  gmem_dest,
int64_t *  smem_src,
const int32_t  num_elements 
)

Aggregates the result stored in shared memory back into global memory, and also writes the stored bin ID, if any, back to global memory. Memory layout assumption: each 64-bit unit of shared memory data is laid out as follows: [0..31: the stored bin ID, to be written back][32..63: the count result, to be aggregated]

Definition at line 107 of file cuda_mapd_rt.cu.

References atomicAdd().

109  {
110  __syncthreads();
111 #pragma unroll
112  for (int i = threadIdx.x; i < num_elements; i += blockDim.x) {
113  int32_t bin_id = *reinterpret_cast<int32_t*>(smem_src + i);
114  int32_t count_result = *(reinterpret_cast<int32_t*>(smem_src + i) + 1);
115  if (count_result) { // non-zero count
116  atomicAdd(reinterpret_cast<unsigned int*>(gmem_dest + i) + 1,
117  static_cast<int32_t>(count_result));
118  // writing back the binId, only if count_result is non-zero
119  *reinterpret_cast<unsigned int*>(gmem_dest + i) = static_cast<int32_t>(bin_id);
120  }
121  }
122 }
__device__ double atomicAdd(double *address, double val)

__device__ void agg_from_smem_to_gmem_count_binId ( int64_t *  gmem_dest,
int64_t *  smem_src,
const int32_t  num_elements 
)

Aggregates the result stored in shared memory back into global memory, and also writes the stored bin ID, if any, back to global memory. Memory layout assumption: each 64-bit unit of shared memory data is laid out as follows: [0..31: the count result, to be aggregated][32..63: the stored bin ID, to be written back]

Definition at line 131 of file cuda_mapd_rt.cu.

References atomicAdd().

133  {
134  __syncthreads();
135 #pragma unroll
136  for (int i = threadIdx.x; i < num_elements; i += blockDim.x) {
137  int32_t count_result = *reinterpret_cast<int32_t*>(smem_src + i);
138  int32_t bin_id = *(reinterpret_cast<int32_t*>(smem_src + i) + 1);
139  if (count_result) { // non-zero count
140  atomicAdd(reinterpret_cast<unsigned int*>(gmem_dest + i),
141  static_cast<int32_t>(count_result));
142  // writing back the binId, only if count_result is non-zero
143  *(reinterpret_cast<unsigned int*>(gmem_dest + i) + 1) =
144  static_cast<int32_t>(bin_id);
145  }
146  }
147 }
__device__ double atomicAdd(double *address, double val)

__device__ void agg_from_smem_to_gmem_nop ( int64_t *  gmem_dest,
int64_t *  smem_src,
const int32_t  num_elements 
)

Definition at line 96 of file cuda_mapd_rt.cu.

98  {}
__device__ void agg_id_double_shared ( int64_t *  agg,
const double  val 
)

Definition at line 757 of file cuda_mapd_rt.cu.

757  {
758  *agg = *(reinterpret_cast<const int64_t*>(&val));
759 }
__device__ void agg_id_double_shared_slow ( int64_t *  agg,
const double *  val 
)

Definition at line 788 of file cuda_mapd_rt.cu.

788  {
789  *agg = *(reinterpret_cast<const int64_t*>(val));
790 }
__device__ void agg_id_float_shared ( int32_t *  agg,
const float  val 
)

Definition at line 821 of file cuda_mapd_rt.cu.

821  {
822  *agg = __float_as_int(val);
823 }
__device__ void agg_id_shared ( int64_t *  agg,
const int64_t  val 
)

Definition at line 714 of file cuda_mapd_rt.cu.

714  {
715  *agg = val;
716 }
__device__ void agg_max_double_shared ( int64_t *  agg,
const double  val 
)

Definition at line 535 of file cuda_mapd_rt.cu.

References atomicMax().

535  {
536  atomicMax(reinterpret_cast<double*>(agg), val);
537 }
__device__ double atomicMax(double *address, double val)

__device__ void agg_max_double_skip_val_shared ( int64_t *  agg,
const double  val,
const double  skip_val 
)

Definition at line 1147 of file cuda_mapd_rt.cu.

References atomicMaxDblSkipVal().

1149  {
1150  if (val != skip_val) {
1151  atomicMaxDblSkipVal(reinterpret_cast<double*>(agg), val, skip_val);
1152  }
1153 }
__device__ double atomicMaxDblSkipVal(double *address, double val, const double skip_val)

__device__ void agg_max_float_shared ( int32_t *  agg,
const float  val 
)

Definition at line 539 of file cuda_mapd_rt.cu.

References atomicMax().

539  {
540  atomicMax(reinterpret_cast<float*>(agg), val);
541 }
__device__ double atomicMax(double *address, double val)

__device__ void agg_max_float_skip_val_shared ( int32_t *  agg,
const float  val,
const float  skip_val 
)

Definition at line 1039 of file cuda_mapd_rt.cu.

References atomicMax().

1041  {
1042  if (__float_as_int(val) != __float_as_int(skip_val)) {
1043  float old = atomicExch(reinterpret_cast<float*>(agg), -FLT_MAX);
1044  atomicMax(reinterpret_cast<float*>(agg),
1045  __float_as_int(old) == __float_as_int(skip_val) ? val : fmaxf(old, val));
1046  }
1047 }
__device__ double atomicMax(double *address, double val)

__device__ void agg_max_int16_shared ( int16_t *  agg,
const int16_t  val 
)

Definition at line 690 of file cuda_mapd_rt.cu.

References atomicMax16().

Referenced by agg_max_int16_skip_val_shared().

690  {
691  return atomicMax16(agg, val);
692 }
__device__ void atomicMax16(int16_t *agg, const int16_t val)

__device__ void agg_max_int16_skip_val_shared ( int16_t *  agg,
const int16_t  val,
const int16_t  skip_val 
)

Definition at line 882 of file cuda_mapd_rt.cu.

References agg_max_int16_shared().

884  {
885  if (val != skip_val) {
886  agg_max_int16_shared(agg, val);
887  }
888 }
__device__ void agg_max_int16_shared(int16_t *agg, const int16_t val)

__device__ void agg_max_int32_shared ( int32_t *  agg,
const int32_t  val 
)

Definition at line 531 of file cuda_mapd_rt.cu.

References atomicMax().

Referenced by agg_max_int32_skip_val_shared().

531  {
532  atomicMax(agg, val);
533 }
__device__ double atomicMax(double *address, double val)

__device__ void agg_max_int32_skip_val_shared ( int32_t *  agg,
const int32_t  val,
const int32_t  skip_val 
)

Definition at line 874 of file cuda_mapd_rt.cu.

References agg_max_int32_shared().

876  {
877  if (val != skip_val) {
878  agg_max_int32_shared(agg, val);
879  }
880 }
__device__ void agg_max_int32_shared(int32_t *agg, const int32_t val)

__device__ void agg_max_int8_shared ( int8_t *  agg,
const int8_t  val 
)

Definition at line 694 of file cuda_mapd_rt.cu.

References atomicMax8().

Referenced by agg_max_int8_skip_val_shared().

694  {
695  return atomicMax8(agg, val);
696 }
__device__ void atomicMax8(int8_t *agg, const int8_t val)

__device__ void agg_max_int8_skip_val_shared ( int8_t *  agg,
const int8_t  val,
const int8_t  skip_val 
)

Definition at line 898 of file cuda_mapd_rt.cu.

References agg_max_int8_shared().

900  {
901  if (val != skip_val) {
902  agg_max_int8_shared(agg, val);
903  }
904 }
__device__ void agg_max_int8_shared(int8_t *agg, const int8_t val)

__device__ void agg_max_shared ( int64_t *  agg,
const int64_t  val 
)

Definition at line 527 of file cuda_mapd_rt.cu.

References atomicMax64().

527  {
528  atomicMax64(agg, val);
529 }
__device__ int64_t atomicMax64(int64_t *address, int64_t val)

__device__ void agg_max_skip_val_shared ( int64_t *  agg,
const int64_t  val,
const int64_t  skip_val 
)

Definition at line 1008 of file cuda_mapd_rt.cu.

References atomicMax64SkipVal().

1010  {
1011  if (val != skip_val) {
1012  atomicMax64SkipVal(agg, val, skip_val);
1013  }
1014 }
__device__ int64_t atomicMax64SkipVal(int64_t *address, int64_t val, const int64_t skip_val)

__device__ void agg_min_double_shared ( int64_t *  agg,
const double  val 
)

Definition at line 706 of file cuda_mapd_rt.cu.

References atomicMin().

706  {
707  atomicMin(reinterpret_cast<double*>(agg), val);
708 }
__device__ double atomicMin(double *address, double val)

__device__ void agg_min_double_skip_val_shared ( int64_t *  agg,
const double  val,
const double  skip_val 
)

Definition at line 1119 of file cuda_mapd_rt.cu.

References atomicMinDblSkipVal().

1121  {
1122  if (val != skip_val) {
1123  atomicMinDblSkipVal(reinterpret_cast<double*>(agg), val, skip_val);
1124  }
1125 }
__device__ double atomicMinDblSkipVal(double *address, double val, const double skip_val)

__device__ void agg_min_float_shared ( int32_t *  agg,
const float  val 
)

Definition at line 710 of file cuda_mapd_rt.cu.

References atomicMin().

710  {
711  atomicMin(reinterpret_cast<float*>(agg), val);
712 }
__device__ double atomicMin(double *address, double val)

__device__ void agg_min_float_skip_val_shared ( int32_t *  agg,
const float  val,
const float  skip_val 
)

Definition at line 1056 of file cuda_mapd_rt.cu.

References atomicMinFltSkipVal().

1058  {
1059  if (__float_as_int(val) != __float_as_int(skip_val)) {
1060  atomicMinFltSkipVal(agg, val, skip_val);
1061  }
1062 }
__device__ float atomicMinFltSkipVal(int32_t *address, float val, const float skip_val)

__device__ void agg_min_int16_shared ( int16_t *  agg,
const int16_t  val 
)

Definition at line 698 of file cuda_mapd_rt.cu.

References atomicMin16().

698  {
699  return atomicMin16(agg, val);
700 }
__device__ void atomicMin16(int16_t *agg, const int16_t val)

__device__ void agg_min_int16_skip_val_shared ( int16_t *  agg,
const int16_t  val,
const int16_t  skip_val 
)

Definition at line 890 of file cuda_mapd_rt.cu.

References atomicMin16SkipVal().

892  {
893  if (val != skip_val) {
894  atomicMin16SkipVal(agg, val, skip_val);
895  }
896 }
__device__ void atomicMin16SkipVal(int16_t *agg, const int16_t val, const int16_t skip_val)

__device__ void agg_min_int32_shared ( int32_t *  agg,
const int32_t  val 
)

Definition at line 547 of file cuda_mapd_rt.cu.

References atomicMin().

547  {
548  atomicMin(agg, val);
549 }
__device__ double atomicMin(double *address, double val)

__device__ void agg_min_int32_skip_val_shared ( int32_t *  agg,
const int32_t  val,
const int32_t  skip_val 
)

Definition at line 921 of file cuda_mapd_rt.cu.

References atomicMin32SkipVal().

923  {
924  if (val != skip_val) {
925  atomicMin32SkipVal(agg, val, skip_val);
926  }
927 }
__device__ int32_t atomicMin32SkipVal(int32_t *address, int32_t val, const int32_t skip_val)

__device__ void agg_min_int8_shared ( int8_t *  agg,
const int8_t  val 
)

Definition at line 702 of file cuda_mapd_rt.cu.

References atomicMin8().

702  {
703  return atomicMin8(agg, val);
704 }
__device__ void atomicMin8(int8_t *agg, const int8_t val)

__device__ void agg_min_int8_skip_val_shared ( int8_t *  agg,
const int8_t  val,
const int8_t  skip_val 
)

Definition at line 906 of file cuda_mapd_rt.cu.

References atomicMin8SkipVal().

908  {
909  if (val != skip_val) {
910  atomicMin8SkipVal(agg, val, skip_val);
911  }
912 }
__device__ void atomicMin8SkipVal(int8_t *agg, const int8_t val, const int8_t skip_val)

__device__ void agg_min_shared ( int64_t *  agg,
const int64_t  val 
)

Definition at line 543 of file cuda_mapd_rt.cu.

References atomicMin64().

543  {
544  atomicMin64(agg, val);
545 }
__device__ int64_t atomicMin64(int64_t *address, int64_t val)

__device__ void agg_min_skip_val_shared ( int64_t *  agg,
const int64_t  val,
const int64_t  skip_val 
)

Definition at line 983 of file cuda_mapd_rt.cu.

References atomicMin64SkipVal().

985  {
986  if (val != skip_val) {
987  atomicMin64SkipVal(agg, val, skip_val);
988  }
989 }
__device__ int64_t atomicMin64SkipVal(int64_t *address, int64_t val, const int64_t skip_val)

__device__ void agg_sum_double_shared ( int64_t *  agg,
const double  val 
)

Definition at line 523 of file cuda_mapd_rt.cu.

References atomicAdd().

523  {
524  atomicAdd(reinterpret_cast<double*>(agg), val);
525 }
__device__ double atomicAdd(double *address, double val)

__device__ void agg_sum_double_skip_val_shared ( int64_t *  agg,
const double  val,
const double  skip_val 
)

Definition at line 1089 of file cuda_mapd_rt.cu.

References atomicSumDblSkipVal().

1091  {
1092  if (__double_as_longlong(val) != __double_as_longlong(skip_val)) {
1093  atomicSumDblSkipVal(reinterpret_cast<double*>(agg), val, skip_val);
1094  }
1095 }
__device__ void atomicSumDblSkipVal(double *address, const double val, const double skip_val)

__device__ void agg_sum_float_shared ( int32_t *  agg,
const float  val 
)

Definition at line 519 of file cuda_mapd_rt.cu.

References atomicAdd().

519  {
520  atomicAdd(reinterpret_cast<float*>(agg), val);
521 }
__device__ double atomicAdd(double *address, double val)

__device__ void agg_sum_float_skip_val_shared ( int32_t *  agg,
const float  val,
const float  skip_val 
)

Definition at line 1071 of file cuda_mapd_rt.cu.

References atomicSumFltSkipVal().

1073  {
1074  if (__float_as_int(val) != __float_as_int(skip_val)) {
1075  atomicSumFltSkipVal(reinterpret_cast<float*>(agg), val, skip_val);
1076  }
1077 }
__device__ void atomicSumFltSkipVal(float *address, const float val, const float skip_val)

__device__ int32_t agg_sum_int32_shared ( int32_t *  agg,
const int32_t  val 
)

Definition at line 515 of file cuda_mapd_rt.cu.

References atomicAdd().

515  {
516  return atomicAdd(agg, val);
517 }
__device__ double atomicAdd(double *address, double val)

__device__ int32_t agg_sum_int32_skip_val_shared ( int32_t *  agg,
const int32_t  val,
const int32_t  skip_val 
)

Definition at line 938 of file cuda_mapd_rt.cu.

References atomicSum32SkipVal().

940  {
941  if (val != skip_val) {
942  const int32_t old = atomicSum32SkipVal(agg, val, skip_val);
943  return old;
944  }
945  return 0;
946 }
__device__ int32_t atomicSum32SkipVal(int32_t *address, const int32_t val, const int32_t skip_val)

__device__ int64_t agg_sum_shared ( int64_t *  agg,
const int64_t  val 
)

Definition at line 511 of file cuda_mapd_rt.cu.

References atomicAdd().

511  {
512  return atomicAdd(reinterpret_cast<unsigned long long*>(agg), val);
513 }
__device__ double atomicAdd(double *address, double val)

__device__ int64_t agg_sum_skip_val_shared ( int64_t *  agg,
const int64_t  val,
const int64_t  skip_val 
)

Definition at line 957 of file cuda_mapd_rt.cu.

References atomicSum64SkipVal().

959  {
960  if (val != skip_val) {
961  return atomicSum64SkipVal(agg, val, skip_val);
962  }
963  return 0;
964 }
__device__ int64_t atomicSum64SkipVal(int64_t *address, const int64_t val, const int64_t skip_val)

__device__ int64_t* alloc_shared_mem_dynamic ( )

Dynamically allocates shared memory per block. The amount of shared memory allocated is defined at kernel launch time. Returns a pointer to the beginning of the allocated shared memory.

Definition at line 51 of file cuda_mapd_rt.cu.

Referenced by init_shared_mem_dynamic().

51  {
52  extern __shared__ int64_t groups_buffer_smem[];
53  return groups_buffer_smem;
54 }
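A minimal usage sketch (hypothetical kernel and parameter names, not from this file), showing that the size of the extern __shared__ buffer returned above is fixed by the third kernel-launch parameter:

// Hypothetical kernel that uses the dynamically sized shared memory buffer.
__global__ void group_by_kernel(int64_t* out) {
  extern __shared__ int64_t groups_buffer_smem[];  // same declaration as in alloc_shared_mem_dynamic()
  groups_buffer_smem[threadIdx.x] = 0;
  __syncthreads();
  // ... use groups_buffer_smem as the per-block group-by buffer, then write results to out ...
}

// Host side: reserve shared_mem_bytes of dynamic shared memory per block.
// group_by_kernel<<<grid_dim, block_dim, shared_mem_bytes>>>(dev_out);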

__device__ double atomicAdd ( double *  address,
double  val 
)

Definition at line 420 of file cuda_mapd_rt.cu.

Referenced by agg_count_int32_shared(), agg_count_shared(), agg_from_smem_to_gmem_binId_count(), agg_from_smem_to_gmem_count_binId(), agg_sum_double_shared(), agg_sum_float_shared(), agg_sum_int32_shared(), agg_sum_shared(), atomicSum32SkipVal(), atomicSum64SkipVal(), atomicSumDblSkipVal(), atomicSumFltSkipVal(), and get_matching_group_value().

420  {
421  unsigned long long int* address_as_ull = (unsigned long long int*)address;
422  unsigned long long int old = *address_as_ull, assumed;
423 
424  do {
425  assumed = old;
426  old = atomicCAS(address_as_ull,
427  assumed,
428  __double_as_longlong(val + __longlong_as_double(assumed)));
429 
430  // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN)
431  } while (assumed != old);
432 
433  return __longlong_as_double(old);
434 }
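The loop above is the standard atomicCAS emulation of a 64-bit floating-point read-modify-write, needed where hardware double atomicAdd is unavailable. As a hedged illustration, the same pattern works for any combining function; the atomicMulDouble helper below is hypothetical and not part of this runtime:

__device__ double atomicMulDouble(double* address, double val) {
  unsigned long long int* address_as_ull = (unsigned long long int*)address;
  unsigned long long int old = *address_as_ull, assumed;
  do {
    assumed = old;
    // Reinterpret the stored bits, combine with val, and try to publish the result.
    old = atomicCAS(address_as_ull,
                    assumed,
                    __double_as_longlong(val * __longlong_as_double(assumed)));
    // Integer comparison avoids an infinite loop when the stored value is NaN.
  } while (assumed != old);
  return __longlong_as_double(old);
}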

__device__ double atomicMax ( double *  address,
double  val 
)

Definition at line 437 of file cuda_mapd_rt.cu.

Referenced by agg_approximate_count_distinct_gpu(), agg_max_double_shared(), agg_max_float_shared(), agg_max_float_skip_val_shared(), agg_max_int32_shared(), and approximate_distinct_tuples_impl().

437  {
438  unsigned long long int* address_as_ull = (unsigned long long int*)address;
439  unsigned long long int old = *address_as_ull, assumed;
440 
441  do {
442  assumed = old;
443  old = atomicCAS(address_as_ull,
444  assumed,
445  __double_as_longlong(max(val, __longlong_as_double(assumed))));
446 
447  // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN)
448  } while (assumed != old);
449 
450  return __longlong_as_double(old);
451 }

__device__ float atomicMax ( float *  address,
float  val 
)

Definition at line 453 of file cuda_mapd_rt.cu.

453  {
454  int* address_as_int = (int*)address;
455  int old = *address_as_int, assumed;
456 
457  do {
458  assumed = old;
459  old = atomicCAS(
460  address_as_int, assumed, __float_as_int(max(val, __int_as_float(assumed))));
461 
462  // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN)
463  } while (assumed != old);
464 
465  return __int_as_float(old);
466 }
__device__ void atomicMax16 ( int16_t *  agg,
const int16_t  val 
)

Definition at line 552 of file cuda_mapd_rt.cu.

Referenced by agg_max_int16_shared().

552  {
553  // properly align the input pointer:
554  unsigned int* base_address_u32 =
555  reinterpret_cast<unsigned int*>(reinterpret_cast<size_t>(agg) & ~0x3);
556 
557  unsigned int old_value = *base_address_u32;
558  unsigned int swap_value, compare_value;
559  do {
560  compare_value = old_value;
561  swap_value =
562  (reinterpret_cast<size_t>(agg) & 0x2)
563  ? static_cast<unsigned int>(max(static_cast<int16_t>(old_value >> 16), val))
564  << 16 |
565  (old_value & 0xFFFF)
566  : (old_value & 0xFFFF0000) |
567  static_cast<unsigned int>(
568  max(static_cast<int16_t>(old_value & 0xFFFF), val));
569  old_value = atomicCAS(base_address_u32, compare_value, swap_value);
570  } while (old_value != compare_value);
571 }

__device__ int64_t atomicMax64 ( int64_t *  address,
int64_t  val 
)

Definition at line 393 of file cuda_mapd_rt.cu.

Referenced by agg_max_shared().

393  {
394  unsigned long long int* address_as_ull = (unsigned long long int*)address;
395  unsigned long long int old = *address_as_ull, assumed;
396 
397  do {
398  assumed = old;
399  old = atomicCAS(address_as_ull, assumed, max((long long)val, (long long)assumed));
400  } while (assumed != old);
401 
402  return old;
403 }

__device__ int64_t atomicMax64SkipVal ( int64_t *  address,
int64_t  val,
const int64_t  skip_val 
)

Definition at line 991 of file cuda_mapd_rt.cu.

Referenced by agg_max_skip_val_shared().

993  {
994  unsigned long long int* address_as_ull =
995  reinterpret_cast<unsigned long long int*>(address);
996  unsigned long long int old = *address_as_ull, assumed;
997 
998  do {
999  assumed = old;
1000  old = atomicCAS(address_as_ull,
1001  assumed,
1002  assumed == skip_val ? val : max((long long)val, (long long)assumed));
1003  } while (assumed != old);
1004 
1005  return old;
1006 }

__device__ void atomicMax8 ( int8_t *  agg,
const int8_t  val 
)

Definition at line 573 of file cuda_mapd_rt.cu.

Referenced by agg_max_int8_shared().

573  {
574  // properly align the input pointer:
575  unsigned int* base_address_u32 =
576  reinterpret_cast<unsigned int*>(reinterpret_cast<size_t>(agg) & ~0x3);
577 
578  // __byte_perm(unsigned int A, unsigned int B, unsigned int s):
579  // if s == 0x3214 returns {A[31..24], A[23..16], A[15..8], B[7..0]}
580  // if s == 0x3240 returns {A[31..24], A[23..16], B[7...0], A[7..0]}
581  // if s == 0x3410 returns {A[31..24], B[7....0], A[15..8], A[7..0]}
582  // if s == 0x4210 returns {B[7....0], A[23..16], A[15..8], A[7..0]}
583  constexpr unsigned int byte_permutations[] = {0x3214, 0x3240, 0x3410, 0x4210};
584  unsigned int old_value = *base_address_u32;
585  unsigned int swap_value, compare_value;
586  do {
587  compare_value = old_value;
588  auto max_value = static_cast<unsigned int>(
589  // compare val with its corresponding bits in the compare_value
590  max(val,
591  static_cast<int8_t>(__byte_perm(
592  compare_value, 0, (reinterpret_cast<size_t>(agg) & 0x3) | 0x4440))));
593  swap_value = __byte_perm(
594  compare_value, max_value, byte_permutations[reinterpret_cast<size_t>(agg) & 0x3]);
595  old_value = atomicCAS(base_address_u32, compare_value, swap_value);
596  } while (compare_value != old_value);
597 }
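A worked example (not from the source): if agg points at byte offset 2 within its aligned 32-bit word, the selector (agg & 0x3) | 0x4440 is 0x4442, so __byte_perm(compare_value, 0, 0x4442) extracts byte 2 of compare_value as the current 8-bit value; after taking the max with val, byte_permutations[2] == 0x3410 splices the new byte back into position 2 while leaving the other three bytes untouched, and the updated word is published with a 32-bit atomicCAS.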

__device__ double atomicMaxDblSkipVal ( double *  address,
double  val,
const double  skip_val 
)

Definition at line 1127 of file cuda_mapd_rt.cu.

Referenced by agg_max_double_skip_val_shared().

1129  {
1130  unsigned long long int* address_as_ull = (unsigned long long int*)address;
1131  unsigned long long int old = *address_as_ull;
1132  unsigned long long int skip_val_as_ull = *((unsigned long long int*)&skip_val);
1133  unsigned long long int assumed;
1134 
1135  do {
1136  assumed = old;
1137  old = atomicCAS(address_as_ull,
1138  assumed,
1139  assumed == skip_val_as_ull
1140  ? *((unsigned long long int*)&val)
1141  : __double_as_longlong(max(val, __longlong_as_double(assumed))));
1142  } while (assumed != old);
1143 
1144  return __longlong_as_double(old);
1145 }

__device__ double atomicMin ( double *  address,
double  val 
)

Definition at line 468 of file cuda_mapd_rt.cu.

Referenced by agg_min_double_shared(), agg_min_float_shared(), agg_min_int32_shared(), atomicMin32SkipVal(), atomicMinFltSkipVal(), and compute_bucket_sizes_impl().

468  {
469  unsigned long long int* address_as_ull = (unsigned long long int*)address;
470  unsigned long long int old = *address_as_ull, assumed;
471 
472  do {
473  assumed = old;
474  old = atomicCAS(address_as_ull,
475  assumed,
476  __double_as_longlong(min(val, __longlong_as_double(assumed))));
477  } while (assumed != old);
478 
479  return __longlong_as_double(old);
480 }

__device__ double atomicMin ( float *  address,
float  val 
)

Definition at line 482 of file cuda_mapd_rt.cu.

482  {
483  int* address_as_ull = (int*)address;
484  int old = *address_as_ull, assumed;
485 
486  do {
487  assumed = old;
488  old = atomicCAS(
489  address_as_ull, assumed, __float_as_int(min(val, __int_as_float(assumed))));
490  } while (assumed != old);
491 
492  return __int_as_float(old);
493 }
__device__ void atomicMin16 ( int16_t *  agg,
const int16_t  val 
)

Definition at line 599 of file cuda_mapd_rt.cu.

Referenced by agg_min_int16_shared().

599  {
600  // properly align the input pointer:
601  unsigned int* base_address_u32 =
602  reinterpret_cast<unsigned int*>(reinterpret_cast<size_t>(agg) & ~0x3);
603 
604  unsigned int old_value = *base_address_u32;
605  unsigned int swap_value, compare_value;
606  do {
607  compare_value = old_value;
608  swap_value =
609  (reinterpret_cast<size_t>(agg) & 0x2)
610  ? static_cast<unsigned int>(min(static_cast<int16_t>(old_value >> 16), val))
611  << 16 |
612  (old_value & 0xFFFF)
613  : (old_value & 0xFFFF0000) |
614  static_cast<unsigned int>(
615  min(static_cast<int16_t>(old_value & 0xFFFF), val));
616  old_value = atomicCAS(base_address_u32, compare_value, swap_value);
617  } while (old_value != compare_value);
618 }

__device__ void atomicMin16SkipVal ( int16_t *  agg,
const int16_t  val,
const int16_t  skip_val 
)

Definition at line 620 of file cuda_mapd_rt.cu.

Referenced by agg_min_int16_skip_val_shared().

622  {
623  // properly align the input pointer:
624  unsigned int* base_address_u32 =
625  reinterpret_cast<unsigned int*>(reinterpret_cast<size_t>(agg) & ~0x3);
626 
627  unsigned int old_value = *base_address_u32;
628  unsigned int swap_value, compare_value;
629  do {
630  compare_value = old_value;
631  int16_t selected_old_val = (reinterpret_cast<size_t>(agg) & 0x2)
632  ? static_cast<int16_t>(old_value >> 16)
633  : static_cast<int16_t>(old_value & 0xFFFF);
634 
635  swap_value =
636  (reinterpret_cast<size_t>(agg) & 0x2)
637  ? static_cast<unsigned int>(
638  selected_old_val == skip_val ? val : min(selected_old_val, val))
639  << 16 |
640  (old_value & 0xFFFF)
641  : (old_value & 0xFFFF0000) |
642  static_cast<unsigned int>(
643  selected_old_val == skip_val ? val : min(selected_old_val, val));
644  old_value = atomicCAS(base_address_u32, compare_value, swap_value);
645  } while (old_value != compare_value);
646 }

__device__ int32_t atomicMin32SkipVal ( int32_t *  address,
int32_t  val,
const int32_t  skip_val 
)

Definition at line 914 of file cuda_mapd_rt.cu.

References atomicMin().

Referenced by agg_min_int32_skip_val_shared().

916  {
917  int32_t old = atomicExch(address, INT_MAX);
918  return atomicMin(address, old == skip_val ? val : min(old, val));
919 }
__device__ double atomicMin(double *address, double val)

__device__ int64_t atomicMin64 ( int64_t *  address,
int64_t  val 
)

Definition at line 405 of file cuda_mapd_rt.cu.

Referenced by agg_min_shared().

405  {
406  unsigned long long int* address_as_ull = (unsigned long long int*)address;
407  unsigned long long int old = *address_as_ull, assumed;
408 
409  do {
410  assumed = old;
411  old = atomicCAS(address_as_ull, assumed, min((long long)val, (long long)assumed));
412  } while (assumed != old);
413 
414  return old;
415 }

__device__ int64_t atomicMin64SkipVal ( int64_t *  address,
int64_t  val,
const int64_t  skip_val 
)

Definition at line 966 of file cuda_mapd_rt.cu.

Referenced by agg_min_skip_val_shared().

968  {
969  unsigned long long int* address_as_ull =
970  reinterpret_cast<unsigned long long int*>(address);
971  unsigned long long int old = *address_as_ull, assumed;
972 
973  do {
974  assumed = old;
975  old = atomicCAS(address_as_ull,
976  assumed,
977  assumed == skip_val ? val : min((long long)val, (long long)assumed));
978  } while (assumed != old);
979 
980  return old;
981 }

__device__ void atomicMin8 ( int8_t *  agg,
const int8_t  val 
)

Definition at line 648 of file cuda_mapd_rt.cu.

Referenced by agg_min_int8_shared().

648  {
649  // properly align the input pointer:
650  unsigned int* base_address_u32 =
651  reinterpret_cast<unsigned int*>(reinterpret_cast<size_t>(agg) & ~0x3);
652 
653  constexpr unsigned int byte_permutations[] = {0x3214, 0x3240, 0x3410, 0x4210};
654  unsigned int old_value = *base_address_u32;
655  unsigned int swap_value, compare_value;
656  do {
657  compare_value = old_value;
658  auto min_value = static_cast<unsigned int>(
659  min(val,
660  static_cast<int8_t>(__byte_perm(
661  compare_value, 0, (reinterpret_cast<size_t>(agg) & 0x3) | 0x4440))));
662  swap_value = __byte_perm(
663  compare_value, min_value, byte_permutations[reinterpret_cast<size_t>(agg) & 0x3]);
664  old_value = atomicCAS(base_address_u32, compare_value, swap_value);
665  } while (compare_value != old_value);
666 }

__device__ void atomicMin8SkipVal ( int8_t *  agg,
const int8_t  val,
const int8_t  skip_val 
)

Definition at line 668 of file cuda_mapd_rt.cu.

Referenced by agg_min_int8_skip_val_shared().

670  {
671  // properly align the input pointer:
672  unsigned int* base_address_u32 =
673  reinterpret_cast<unsigned int*>(reinterpret_cast<size_t>(agg) & ~0x3);
674 
675  constexpr unsigned int byte_permutations[] = {0x3214, 0x3240, 0x3410, 0x4210};
676  unsigned int old_value = *base_address_u32;
677  unsigned int swap_value, compare_value;
678  do {
679  compare_value = old_value;
680  int8_t selected_old_val = static_cast<int8_t>(
681  __byte_perm(compare_value, 0, (reinterpret_cast<size_t>(agg) & 0x3) | 0x4440));
682  auto min_value = static_cast<unsigned int>(
683  selected_old_val == skip_val ? val : min(val, selected_old_val));
684  swap_value = __byte_perm(
685  compare_value, min_value, byte_permutations[reinterpret_cast<size_t>(agg) & 0x3]);
686  old_value = atomicCAS(base_address_u32, compare_value, swap_value);
687  } while (compare_value != old_value);
688 }

__device__ double atomicMinDblSkipVal ( double *  address,
double  val,
const double  skip_val 
)

Definition at line 1097 of file cuda_mapd_rt.cu.

Referenced by agg_min_double_skip_val_shared().

1099  {
1100  unsigned long long int* address_as_ull =
1101  reinterpret_cast<unsigned long long int*>(address);
1102  unsigned long long int old = *address_as_ull;
1103  unsigned long long int skip_val_as_ull =
1104  *reinterpret_cast<const unsigned long long*>(&skip_val);
1105  unsigned long long int assumed;
1106 
1107  do {
1108  assumed = old;
1109  old = atomicCAS(address_as_ull,
1110  assumed,
1111  assumed == skip_val_as_ull
1112  ? *reinterpret_cast<unsigned long long*>(&val)
1113  : __double_as_longlong(min(val, __longlong_as_double(assumed))));
1114  } while (assumed != old);
1115 
1116  return __longlong_as_double(old);
1117 }

__device__ float atomicMinFltSkipVal ( int32_t *  address,
float  val,
const float  skip_val 
)

Definition at line 1049 of file cuda_mapd_rt.cu.

References atomicMin().

Referenced by agg_min_float_skip_val_shared().

1049  {
1050  float old = atomicExch(reinterpret_cast<float*>(address), FLT_MAX);
1051  return atomicMin(
1052  reinterpret_cast<float*>(address),
1053  __float_as_int(old) == __float_as_int(skip_val) ? val : fminf(old, val));
1054 }
__device__ double atomicMin(double *address, double val)

__device__ int32_t atomicSum32SkipVal ( int32_t *  address,
const int32_t  val,
const int32_t  skip_val 
)

Definition at line 929 of file cuda_mapd_rt.cu.

References atomicAdd().

Referenced by agg_sum_int32_skip_val_shared().

931  {
932  unsigned int* address_as_int = (unsigned int*)address;
933  int32_t old = atomicExch(address_as_int, 0);
934  int32_t old2 = atomicAdd(address_as_int, old == skip_val ? val : (val + old));
935  return old == skip_val ? old2 : (old2 + old);
936 }
__device__ double atomicAdd(double *address, double val)

__device__ int64_t atomicSum64SkipVal ( int64_t *  address,
const int64_t  val,
const int64_t  skip_val 
)

Definition at line 948 of file cuda_mapd_rt.cu.

References atomicAdd().

Referenced by agg_sum_skip_val_shared().

950  {
951  unsigned long long int* address_as_ull = (unsigned long long int*)address;
952  int64_t old = atomicExch(address_as_ull, 0);
953  int64_t old2 = atomicAdd(address_as_ull, old == skip_val ? val : (val + old));
954  return old == skip_val ? old2 : (old2 + old);
955 }
__device__ double atomicAdd(double *address, double val)

__device__ void atomicSumDblSkipVal ( double *  address,
const double  val,
const double  skip_val 
)

Definition at line 1079 of file cuda_mapd_rt.cu.

References atomicAdd().

Referenced by agg_sum_double_skip_val_shared().

1081  {
1082  unsigned long long int* address_as_ull = (unsigned long long int*)address;
1083  double old = __longlong_as_double(atomicExch(address_as_ull, __double_as_longlong(0.)));
1084  atomicAdd(
1085  address,
1086  __double_as_longlong(old) == __double_as_longlong(skip_val) ? val : (val + old));
1087 }
__device__ double atomicAdd(double *address, double val)

__device__ void atomicSumFltSkipVal ( float *  address,
const float  val,
const float  skip_val 
)

Definition at line 1064 of file cuda_mapd_rt.cu.

References atomicAdd().

Referenced by agg_sum_float_skip_val_shared().

1066  {
1067  float old = atomicExch(address, 0.f);
1068  atomicAdd(address, __float_as_int(old) == __float_as_int(skip_val) ? val : (val + old));
1069 }
__device__ double atomicAdd(double *address, double val)

__device__ int32_t checked_single_agg_id_double_shared ( int64_t *  agg,
const double  val,
const double  null_val 
)

Definition at line 761 of file cuda_mapd_rt.cu.

763  {
764  unsigned long long int* address_as_ull = reinterpret_cast<unsigned long long int*>(agg);
765  unsigned long long int old = *address_as_ull, assumed;
766 
767  if (val == null_val) {
768  return 0;
769  }
770 
771  do {
772  if (static_cast<int64_t>(old) != __double_as_longlong(null_val)) {
773  if (static_cast<int64_t>(old) != __double_as_longlong(val)) {
774  // see Execute::ERR_SINGLE_VALUE_FOUND_MULTIPLE_VALUES
775  return 15;
776  } else {
777  break;
778  }
779  }
780 
781  assumed = old;
782  old = atomicCAS(address_as_ull, assumed, __double_as_longlong(val));
783  } while (assumed != old);
784 
785  return 0;
786 }
__device__ int32_t checked_single_agg_id_double_shared_slow ( int64_t *  agg,
const double *  valp,
const double  null_val 
)

Definition at line 793 of file cuda_mapd_rt.cu.

795  {
796  unsigned long long int* address_as_ull = reinterpret_cast<unsigned long long int*>(agg);
797  unsigned long long int old = *address_as_ull, assumed;
798  double val = *valp;
799 
800  if (val == null_val) {
801  return 0;
802  }
803 
804  do {
805  if (static_cast<int64_t>(old) != __double_as_longlong(null_val)) {
806  if (static_cast<int64_t>(old) != __double_as_longlong(val)) {
807  // see Execute::ERR_SINGLE_VALUE_FOUND_MULTIPLE_VALUES
808  return 15;
809  } else {
810  break;
811  }
812  }
813 
814  assumed = old;
815  old = atomicCAS(address_as_ull, assumed, __double_as_longlong(val));
816  } while (assumed != old);
817 
818  return 0;
819 }
__device__ int32_t checked_single_agg_id_float_shared ( int32_t *  agg,
const float  val,
const float  null_val 
)

Definition at line 825 of file cuda_mapd_rt.cu.

827  {
828  int* address_as_ull = reinterpret_cast<int*>(agg);
829  int old = *address_as_ull, assumed;
830 
831  if (val == null_val) {
832  return 0;
833  }
834 
835  do {
836  if (old != __float_as_int(null_val)) {
837  if (old != __float_as_int(val)) {
838  // see Execute::ERR_SINGLE_VALUE_FOUND_MULTIPLE_VALUES
839  return 15;
840  } else {
841  break;
842  }
843  }
844 
845  assumed = old;
846  old = atomicCAS(address_as_ull, assumed, __float_as_int(val));
847  } while (assumed != old);
848 
849  return 0;
850 }
__device__ int32_t checked_single_agg_id_shared ( int64_t *  agg,
const int64_t  val,
const int64_t  null_val 
)

Definition at line 718 of file cuda_mapd_rt.cu.

720  {
721  unsigned long long int* address_as_ull = reinterpret_cast<unsigned long long int*>(agg);
722  unsigned long long int old = *address_as_ull, assumed;
723 
724  if (val == null_val) {
725  return 0;
726  }
727 
728  do {
729  if (static_cast<int64_t>(old) != null_val) {
730  if (static_cast<int64_t>(old) != val) {
731  // see Execute::ERR_SINGLE_VALUE_FOUND_MULTIPLE_VALUES
732  return 15;
733  } else {
734  break;
735  }
736  }
737 
738  assumed = old;
739  old = atomicCAS(address_as_ull, assumed, val);
740  } while (assumed != old);
741 
742  return 0;
743 }
__device__ bool dynamic_watchdog ( )

Definition at line 181 of file cuda_mapd_rt.cu.

Referenced by anonymous_namespace{ResultSetReduction.cpp}::check_watchdog(), check_watchdog_rt(), get_group_value_columnar_slot_with_watchdog(), get_group_value_columnar_with_watchdog(), and get_group_value_with_watchdog().

181  {
182  // check for dynamic watchdog, if triggered all threads return true
183  if (dw_cycle_budget == 0LL) {
184  return false; // Uninitialized watchdog can't check time
185  }
186  if (dw_abort == 1) {
187  return true; // Received host request to abort
188  }
189  uint32_t smid = get_smid();
190  if (smid >= 128) {
191  return false;
192  }
193  __shared__ volatile int64_t dw_block_cycle_start; // Thread block shared cycle start
194  __shared__ volatile bool
195  dw_should_terminate; // all threads within a block should return together if
196  // watchdog criteria is met
197 
198  // thread 0 either initializes or reads the initial clock cycle; the result is stored
199  // into shared memory. Since all threads within a block share the same SM, there's no
200  // point in using more threads here.
201  if (threadIdx.x == 0) {
202  dw_block_cycle_start = 0LL;
203  int64_t cycle_count = static_cast<int64_t>(clock64());
204  // Make sure the block hasn't switched SMs
205  if (smid == get_smid()) {
206  dw_block_cycle_start = static_cast<int64_t>(
207  atomicCAS(reinterpret_cast<unsigned long long*>(&dw_sm_cycle_start[smid]),
208  0ULL,
209  static_cast<unsigned long long>(cycle_count)));
210  }
211 
212  int64_t cycles = cycle_count - dw_block_cycle_start;
213  if ((smid == get_smid()) && (dw_block_cycle_start > 0LL) &&
214  (cycles > dw_cycle_budget)) {
215  // Check if we're out of time on this particular SM
216  dw_should_terminate = true;
217  } else {
218  dw_should_terminate = false;
219  }
220  }
221  __syncthreads();
222  return dw_should_terminate;
223 }
__device__ int64_t dw_sm_cycle_start[128]
__device__ int64_t dw_cycle_budget
__inline__ __device__ uint32_t get_smid(void)
__device__ int32_t dw_abort

__device__ void force_sync ( )

Definition at line 1327 of file cuda_mapd_rt.cu.

1327  {
1328  __threadfence_block();
1329 }
template<typename T = unsigned long long>
__device__ T get_empty_key ( )
inline

Definition at line 226 of file cuda_mapd_rt.cu.

References EMPTY_KEY_64.

226  {
227  return EMPTY_KEY_64;
228 }
#define EMPTY_KEY_64
template<>
__device__ unsigned int get_empty_key ( )
inline

Definition at line 231 of file cuda_mapd_rt.cu.

References EMPTY_KEY_32.

231  {
232  return EMPTY_KEY_32;
233 }
#define EMPTY_KEY_32
template<typename T >
__device__ int64_t* get_matching_group_value ( int64_t *  groups_buffer,
const uint32_t  h,
const T *  key,
const uint32_t  key_count,
const uint32_t  row_size_quad 
)
inline

Definition at line 236 of file cuda_mapd_rt.cu.

References align_to_int64(), and atomicAdd().

Referenced by get_group_value(), get_group_value_with_watchdog(), and get_matching_group_value().

240  {
241  const T empty_key = get_empty_key<T>();
242  uint32_t off = h * row_size_quad;
243  auto row_ptr = reinterpret_cast<T*>(groups_buffer + off);
244  {
245  const T old = atomicCAS(row_ptr, empty_key, *key);
246  if (empty_key == old && key_count > 1) {
247  for (size_t i = 1; i <= key_count - 1; ++i) {
248  atomicExch(row_ptr + i, key[i]);
249  }
250  }
251  }
252  if (key_count > 1) {
253  while (atomicAdd(row_ptr + key_count - 1, 0) == empty_key) {
254  // spin until the winning thread has finished writing the entire key and the init
255  // value
256  }
257  }
258  bool match = true;
259  for (uint32_t i = 0; i < key_count; ++i) {
260  if (row_ptr[i] != key[i]) {
261  match = false;
262  break;
263  }
264  }
265 
266  if (match) {
267  auto row_ptr_i8 = reinterpret_cast<int8_t*>(row_ptr + key_count);
268  return reinterpret_cast<int64_t*>(align_to_int64(row_ptr_i8));
269  }
270  return NULL;
271 }
__device__ double atomicAdd(double *address, double val)
FORCE_INLINE HOST DEVICE T align_to_int64(T addr)
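As a hedged sketch of how this primitive is driven (the real probing loops live in callers such as get_group_value(); the hash choice and linear probing below are assumptions), a caller hashes the key, then probes until a non-NULL payload pointer comes back:

// Simplified, hypothetical probing loop over the row-wise group-by buffer.
__device__ int64_t* probe_for_group(int64_t* groups_buffer,
                                    const uint32_t entry_count,
                                    const int64_t* key,
                                    const uint32_t key_count,
                                    const uint32_t row_size_quad) {
  const uint32_t h = MurmurHash1(key, key_count * sizeof(int64_t), 0) % entry_count;
  uint32_t probe = h;
  do {
    int64_t* payload = get_matching_group_value(
        groups_buffer,
        probe,
        reinterpret_cast<const unsigned long long*>(key),  // 8-byte key components
        key_count,
        row_size_quad);
    if (payload) {
      return payload;  // slot was empty and claimed, or already held a matching key
    }
    probe = (probe + 1) % entry_count;  // linear probing on collision
  } while (probe != h);
  return NULL;  // table is full
}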

__device__ int64_t* get_matching_group_value ( int64_t *  groups_buffer,
const uint32_t  h,
const int64_t *  key,
const uint32_t  key_count,
const uint32_t  key_width,
const uint32_t  row_size_quad,
const int64_t *  init_vals 
)

Definition at line 273 of file cuda_mapd_rt.cu.

References get_matching_group_value().

279  {
280  switch (key_width) {
281  case 4:
282  return get_matching_group_value(groups_buffer,
283  h,
284  reinterpret_cast<const unsigned int*>(key),
285  key_count,
286  row_size_quad);
287  case 8:
288  return get_matching_group_value(groups_buffer,
289  h,
290  reinterpret_cast<const unsigned long long*>(key),
291  key_count,
292  row_size_quad);
293  default:
294  return NULL;
295  }
296 }
__device__ int64_t * get_matching_group_value(int64_t *groups_buffer, const uint32_t h, const T *key, const uint32_t key_count, const uint32_t row_size_quad)

__device__ int64_t* get_matching_group_value_columnar ( int64_t *  groups_buffer,
const uint32_t  h,
const int64_t *  key,
const uint32_t  key_qw_count,
const size_t  entry_count 
)

Definition at line 359 of file cuda_mapd_rt.cu.

References EMPTY_KEY_64, and key_qw_count.

Referenced by get_group_value_columnar(), and get_group_value_columnar_with_watchdog().

364  {
365  uint32_t off = h;
366  {
367  const uint64_t old = atomicCAS(
368  reinterpret_cast<unsigned long long*>(groups_buffer + off), EMPTY_KEY_64, *key);
369  if (EMPTY_KEY_64 == old) {
370  for (size_t i = 0; i < key_qw_count; ++i) {
371  groups_buffer[off] = key[i];
372  off += entry_count;
373  }
374  return &groups_buffer[off];
375  }
376  }
377  __syncthreads();
378  off = h;
379  for (size_t i = 0; i < key_qw_count; ++i) {
380  if (groups_buffer[off] != key[i]) {
381  return NULL;
382  }
383  off += entry_count;
384  }
385  return &groups_buffer[off];
386 }
#define EMPTY_KEY_64
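The columnar layout walked by the loops above can be summarized with a small illustrative helper (an assumption, not part of this file): key component k of entry h lives at offset h + k * entry_count.

// Illustrative only: address of the k-th key component of entry h in the
// columnar group-by buffer.
__device__ inline int64_t* columnar_key_ptr(int64_t* groups_buffer,
                                            const uint32_t h,
                                            const uint32_t k,
                                            const size_t entry_count) {
  return groups_buffer + h + k * entry_count;
}
// The payload for entry h then starts at groups_buffer + h + key_qw_count * entry_count,
// which is exactly the pointer get_matching_group_value_columnar returns.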

template<typename T >
__device__ int32_t get_matching_group_value_columnar_slot ( int64_t *  groups_buffer,
const uint32_t  entry_count,
const uint32_t  h,
const T *  key,
const uint32_t  key_count 
)

Definition at line 299 of file cuda_mapd_rt.cu.

Referenced by get_group_value_columnar_slot(), get_group_value_columnar_slot_with_watchdog(), and get_matching_group_value_columnar_slot().

303  {
304  const T empty_key = get_empty_key<T>();
305  const uint64_t old =
306  atomicCAS(reinterpret_cast<T*>(groups_buffer + h), empty_key, *key);
307  // the winning thread proceeds with writing the rest of the keys
308  if (old == empty_key) {
309  uint32_t offset = h + entry_count;
310  for (size_t i = 1; i < key_count; ++i) {
311  *reinterpret_cast<T*>(groups_buffer + offset) = key[i];
312  offset += entry_count;
313  }
314  }
315 
316  __threadfence();
317  // for all threads except the winning thread, the memory content of the keys
318  // at the hash offset is checked again. In case of a complete match
319  // the hash offset is returned; otherwise -1 is returned
320  if (old != empty_key) {
321  uint32_t offset = h;
322  for (uint32_t i = 0; i < key_count; ++i) {
323  if (*reinterpret_cast<T*>(groups_buffer + offset) != key[i]) {
324  return -1;
325  }
326  offset += entry_count;
327  }
328  }
329  return h;
330 }

__device__ int32_t get_matching_group_value_columnar_slot ( int64_t *  groups_buffer,
const uint32_t  entry_count,
const uint32_t  h,
const int64_t *  key,
const uint32_t  key_count,
const uint32_t  key_width 
)

Definition at line 333 of file cuda_mapd_rt.cu.

References get_matching_group_value_columnar_slot().

338  {
339  switch (key_width) {
340  case 4:
341  return get_matching_group_value_columnar_slot(
342  groups_buffer,
343  entry_count,
344  h,
345  reinterpret_cast<const unsigned int*>(key),
346  key_count);
347  case 8:
348  return get_matching_group_value_columnar_slot(
349  groups_buffer,
350  entry_count,
351  h,
352  reinterpret_cast<const unsigned long long*>(key),
353  key_count);
354  default:
355  return -1;
356  }
357 }
__device__ int32_t get_matching_group_value_columnar_slot(int64_t *groups_buffer, const uint32_t entry_count, const uint32_t h, const T *key, const uint32_t key_count)

__inline__ __device__ uint32_t get_smid ( void  )

Definition at line 163 of file cuda_mapd_rt.cu.

Referenced by dynamic_watchdog().

163  {
164  uint32_t ret;
165  asm("mov.u32 %0, %%smid;" : "=r"(ret));
166  return ret;
167 }

__device__ int32_t group_buff_idx_impl ( )

Definition at line 15 of file cuda_mapd_rt.cu.

References pos_start_impl().

15  {
16  return pos_start_impl(NULL);
17 }
__device__ int32_t pos_start_impl(const int32_t *row_index_resume)
Definition: cuda_mapd_rt.cu:11

+ Here is the call graph for this function:

__device__ const int64_t* init_shared_mem ( const int64_t *  groups_buffer,
const int32_t  groups_buffer_size 
)

Definition at line 36 of file cuda_mapd_rt.cu.

37  {
38  extern __shared__ int64_t fast_bins[];
39  if (threadIdx.x == 0) {
40  memcpy(fast_bins, groups_buffer, groups_buffer_size);
41  }
42  __syncthreads();
43  return fast_bins;
44 }
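A hedged kernel skeleton (an assumption about usage, not code from this file) shows how init_shared_mem() and write_back() bracket a shared-memory aggregation. Note that groups_buffer_size is passed straight to memcpy, so here it is a byte count, and the kernel must be launched with at least that much dynamic shared memory.

// Hypothetical skeleton; the aggregation step in the middle is elided.
__global__ void shared_mem_groupby_skeleton(int64_t* groups_buffer,
                                            const int32_t groups_buffer_size) {
  int64_t* smem_buffer =
      const_cast<int64_t*>(init_shared_mem(groups_buffer, groups_buffer_size));
  // ... per-thread aggregation into smem_buffer would go here ...
  write_back(groups_buffer, smem_buffer, groups_buffer_size);
}
// Launch sketch: shared_mem_groupby_skeleton<<<grid, block, groups_buffer_size>>>(buf, size);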
__device__ const int64_t* init_shared_mem_dynamic ( const int64_t *  groups_buffer,
const int32_t  groups_buffer_size 
)

Initializes dynamic shared memory:

  1. Allocates dynamic shared memory.
  2. Sets every allocated element equal to the 'identity element' (zero by default).

Definition at line 77 of file cuda_mapd_rt.cu.

References alloc_shared_mem_dynamic(), and set_shared_mem_to_identity().

79  {
80  int64_t* groups_buffer_smem = alloc_shared_mem_dynamic();
81  set_shared_mem_to_identity(groups_buffer_smem, groups_buffer_size);
82  return groups_buffer_smem;
83 }
__device__ int64_t * alloc_shared_mem_dynamic()
Definition: cuda_mapd_rt.cu:51
__device__ void set_shared_mem_to_identity(int64_t *groups_buffer_smem, const int32_t groups_buffer_size, const int64_t identity_element=0)
Definition: cuda_mapd_rt.cu:61

__device__ const int64_t* init_shared_mem_nop ( const int64_t *  groups_buffer,
const int32_t  groups_buffer_size 
)

Definition at line 27 of file cuda_mapd_rt.cu.

References groups_buffer.

29  {
30  return groups_buffer;
31 }
__device__ void linear_probabilistic_count ( uint8_t *  bitmap,
const uint32_t  bitmap_bytes,
const uint8_t *  key_bytes,
const uint32_t  key_len 
)

Definition at line 1256 of file cuda_mapd_rt.cu.

1259  {
1260  const uint32_t bit_pos = MurmurHash1(key_bytes, key_len, 0) % (bitmap_bytes * 8);
1261  const uint32_t word_idx = bit_pos / 32;
1262  const uint32_t bit_idx = bit_pos % 32;
1263  atomicOr(((uint32_t*)bitmap) + word_idx, 1 << bit_idx);
1264 }
NEVER_INLINE DEVICE uint32_t MurmurHash1(const void *key, int len, const uint32_t seed)
Definition: MurmurHash.cpp:20
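The bitmap this routine fills can be turned into an approximate distinct count with the standard linear-counting estimator, n ≈ -m * ln(z/m), where m is the bitmap size in bits and z the number of bits still zero. The host-side sketch below (run after copying the bitmap back to the host) is an assumption about how such a reduction could look, not OmniSciDB's actual reduction code.

// Host-side sketch (assumption): estimate the number of distinct values.
#include <cmath>
#include <cstdint>
#include <vector>

double estimate_ndv_linear_counting(const std::vector<uint8_t>& bitmap) {
  const double m = static_cast<double>(bitmap.size()) * 8.0;  // bits in the bitmap
  double zero_bits = 0.0;
  for (const uint8_t byte : bitmap) {
    zero_bits += 8 - __builtin_popcount(byte);  // count bits still unset
  }
  if (zero_bits == 0.0) {
    return m;  // bitmap saturated; the estimator is no longer reliable
  }
  return -m * std::log(zero_bits / m);
}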
__device__ int32_t pos_start_impl ( const int32_t *  row_index_resume)

Definition at line 11 of file cuda_mapd_rt.cu.

Referenced by get_bin_from_k_heap_impl(), group_buff_idx_impl(), and record_error_code().

11  {
12  return blockIdx.x * blockDim.x + threadIdx.x;
13 }

__device__ int32_t pos_step_impl ( )

Definition at line 19 of file cuda_mapd_rt.cu.

Referenced by get_bin_from_k_heap_impl().

19  {
20  return blockDim.x * gridDim.x;
21 }
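Together, pos_start_impl() and pos_step_impl() encode the canonical CUDA grid-stride loop: the former is the global thread index, the latter the total number of threads in the grid. A minimal illustrative kernel (not from this file) looks like:

// Illustrative only: grid-stride loop over row_count rows.
__global__ void copy_column(const int64_t* in, int64_t* out, const int32_t row_count) {
  for (int32_t row = pos_start_impl(NULL); row < row_count; row += pos_step_impl()) {
    out[row] = in[row];
  }
}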

__device__ void set_shared_mem_to_identity ( int64_t *  groups_buffer_smem,
const int32_t  groups_buffer_size,
const int64_t  identity_element = 0 
)

Sets the allocated shared memory elements equal to 'identity_element'. groups_buffer_size: number of 64-bit elements in shared memory per thread block (i.e., it is expressed in units of 64-bit elements, not bytes).

Definition at line 61 of file cuda_mapd_rt.cu.

Referenced by init_shared_mem_dynamic().

64  {
65 #pragma unroll
66  for (int i = threadIdx.x; i < groups_buffer_size; i += blockDim.x) {
67  groups_buffer_smem[i] = identity_element;
68  }
69  __syncthreads();
70 }

__device__ bool slotEmptyKeyCAS ( int64_t *  slot,
int64_t  new_val,
int64_t  init_val 
)

Definition at line 1157 of file cuda_mapd_rt.cu.

1159  {
1160  auto slot_address = reinterpret_cast<unsigned long long int*>(slot);
1161  const auto empty_key =
1162  static_cast<unsigned long long int*>(static_cast<void*>(&init_val));
1163  const auto new_val_cast =
1164  static_cast<unsigned long long int*>(static_cast<void*>(&new_val));
1165 
1166  const auto old_val = atomicCAS(slot_address, *empty_key, *new_val_cast);
1167  if (old_val == *empty_key) {
1168  return true;
1169  } else {
1170  return false;
1171  }
1172 }
__device__ bool slotEmptyKeyCAS_int16 ( int16_t *  slot,
int16_t  new_val,
int16_t  init_val 
)

Definition at line 1185 of file cuda_mapd_rt.cu.

1187  {
1188  unsigned int* base_slot_address =
1189  reinterpret_cast<unsigned int*>(reinterpret_cast<size_t>(slot) & ~0x3);
1190  unsigned int old_value = *base_slot_address;
1191  unsigned int swap_value, compare_value;
1192  do {
1193  compare_value = old_value;
1194  // exit criteria: if init_val does not exist in the slot (some other thread has
1195  // succeeded)
1196  if (static_cast<unsigned int>(init_val) !=
1197  __byte_perm(
1198  compare_value, 0, (reinterpret_cast<size_t>(slot) & 0x2 ? 0x3244 : 0x4410))) {
1199  return false;
1200  }
1201  swap_value = __byte_perm(compare_value,
1202  static_cast<unsigned int>(new_val),
1203  (reinterpret_cast<size_t>(slot) & 0x2) ? 0x5410 : 0x3254);
1204  old_value = atomicCAS(base_slot_address, compare_value, swap_value);
1205  } while (compare_value != old_value);
1206  return true;
1207 }
__device__ bool slotEmptyKeyCAS_int32 ( int32_t *  slot,
int32_t  new_val,
int32_t  init_val 
)

Definition at line 1174 of file cuda_mapd_rt.cu.

1176  {
1177  unsigned int* slot_address = reinterpret_cast<unsigned int*>(slot);
1178  unsigned int compare_value = static_cast<unsigned int>(init_val);
1179  unsigned int swap_value = static_cast<unsigned int>(new_val);
1180 
1181  const unsigned int old_value = atomicCAS(slot_address, compare_value, swap_value);
1182  return old_value == compare_value;
1183 }
__device__ bool slotEmptyKeyCAS_int8 ( int8_t *  slot,
int8_t  new_val,
int8_t  init_val 
)

Definition at line 1209 of file cuda_mapd_rt.cu.

1211  {
1212  // properly align the slot address:
1213  unsigned int* base_slot_address =
1214  reinterpret_cast<unsigned int*>(reinterpret_cast<size_t>(slot) & ~0x3);
1215  constexpr unsigned int byte_permutations[] = {0x3214, 0x3240, 0x3410, 0x4210};
1216  unsigned int old_value = *base_slot_address;
1217  unsigned int swap_value, compare_value;
1218  do {
1219  compare_value = old_value;
1220  // exit criteria: if init_val does not exist in the slot (some other thread has
1221  // succeeded)
1222  if (static_cast<unsigned int>(init_val) !=
1223  __byte_perm(compare_value, 0, (reinterpret_cast<size_t>(slot) & 0x3) | 0x4440)) {
1224  return false;
1225  }
1226  swap_value = __byte_perm(compare_value,
1227  static_cast<unsigned int>(new_val),
1228  byte_permutations[reinterpret_cast<size_t>(slot) & 0x3]);
1229  old_value = atomicCAS(base_slot_address, compare_value, swap_value);
1230  } while (compare_value != old_value);
1231  return true;
1232 }
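As a hedged usage sketch (an assumption, not code from this file): because the CAS loop only swaps the targeted byte and retries when a neighbour changes the surrounding word, several threads can independently claim adjacent 8-bit slots packed into the same aligned 32-bit word.

// Illustrative only: threads 0..3 each claim a different byte of one 32-bit word.
__device__ void claim_byte_slots(int8_t* slots /* assumed 4-byte aligned */,
                                 const int8_t init_val) {
  if (threadIdx.x < 4) {
    slotEmptyKeyCAS_int8(slots + threadIdx.x,
                         static_cast<int8_t>(threadIdx.x + 1),  // new value for this byte
                         init_val);
  }
}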
__device__ uint64_t string_decode ( int8_t *  chunk_iter_,
int64_t  pos 
)

Definition at line 1245 of file cuda_mapd_rt.cu.

References ChunkIter_get_nth(), VarlenDatum::is_null, VarlenDatum::length, and VarlenDatum::pointer.

1245  {
1246  // TODO(alex): de-dup, the x64 version is basically identical
1247  ChunkIter* chunk_iter = reinterpret_cast<ChunkIter*>(chunk_iter_);
1248  VarlenDatum vd;
1249  bool is_end;
1250  ChunkIter_get_nth(chunk_iter, pos, false, &vd, &is_end);
1251  return vd.is_null ? 0
1252  : (reinterpret_cast<uint64_t>(vd.pointer) & 0xffffffffffff) |
1253  (static_cast<uint64_t>(vd.length) << 48);
1254 }
bool is_null
Definition: sqltypes.h:76
DEVICE void ChunkIter_get_nth(ChunkIter *it, int n, bool uncompress, VarlenDatum *result, bool *is_end)
Definition: ChunkIter.cpp:181
int8_t * pointer
Definition: sqltypes.h:75
size_t length
Definition: sqltypes.h:74
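The returned value packs the string pointer into the low 48 bits and the length into the high 16 bits. A hedged sketch of the inverse unpacking a caller would perform (illustrative only, not code from this file):

// Illustrative unpacking helper.
__device__ inline void unpack_string(const uint64_t packed,
                                     const int8_t** ptr,
                                     uint32_t* len) {
  *ptr = reinterpret_cast<const int8_t*>(packed & 0xffffffffffffULL);
  *len = static_cast<uint32_t>(packed >> 48);
}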

__device__ void sync_warp ( )

Definition at line 1331 of file cuda_mapd_rt.cu.

1331  {
1332 #if (CUDA_VERSION >= 9000)
1333  __syncwarp();
1334 #endif
1335 }
__device__ void sync_warp_protected ( int64_t  thread_pos,
int64_t  row_count 
)

Protected warp synchronization that makes sure either all or none of the threads within a warp go through the synchronization barrier. thread_pos: the current thread position, used for a memory access. row_count: maximum number of rows to be processed. The function performs a warp sync iff all 32 threads within that warp will process valid data; for example, with row_count = 100, thread positions 0-95 synchronize while positions 96-127, which fall in the warp containing the last (partially valid) group of rows, skip the barrier. NOTE: it currently assumes a warp size of 32.

Definition at line 1344 of file cuda_mapd_rt.cu.

1344  {
1345 #if (CUDA_VERSION >= 9000)
1346  // only syncing if NOT within the same warp as those threads experiencing the critical
1347  // edge
1348  if ((((row_count - 1) | 0x1F) - thread_pos) >= 32) {
1349  __syncwarp();
1350  }
1351 #endif
1352 }
__device__ int8_t thread_warp_idx ( const int8_t  warp_sz)

Definition at line 23 of file cuda_mapd_rt.cu.

23  {
24  return threadIdx.x % warp_sz;
25 }
__device__ void write_back ( int64_t *  dest,
int64_t *  src,
const int32_t  sz 
)

Definition at line 85 of file cuda_mapd_rt.cu.

85  {
86  __syncthreads();
87  if (threadIdx.x == 0) {
88  memcpy(dest, src, sz);
89  }
90 }
int64_t * src
__device__ void write_back_nop ( int64_t *  dest,
int64_t *  src,
const int32_t  sz 
)

Definition at line 33 of file cuda_mapd_rt.cu.

33  {
34 }
__device__ void write_back_smem_nop ( int64_t *  dest,
int64_t *  src,
const int32_t  sz 
)

Definition at line 92 of file cuda_mapd_rt.cu.

94  {}

Variable Documentation

__device__ int32_t dw_abort = 0

Definition at line 161 of file cuda_mapd_rt.cu.

Referenced by dynamic_watchdog(), dynamic_watchdog_init(), and Executor::interrupt().

__device__ int64_t dw_cycle_budget = 0

Definition at line 160 of file cuda_mapd_rt.cu.

Referenced by dynamic_watchdog(), and dynamic_watchdog_init().

__device__ int64_t dw_sm_cycle_start[128]

Definition at line 158 of file cuda_mapd_rt.cu.

Referenced by dynamic_watchdog().