OmniSciDB  eb3a3d0a03
cuda_mapd_rt.cu File Reference
#include <cuda.h>
#include <float.h>
#include <stdint.h>
#include <stdio.h>
#include <limits>
#include "BufferCompaction.h"
#include "ExtensionFunctions.hpp"
#include "GpuRtConstants.h"
#include "HyperLogLogRank.h"
#include "TableFunctions/TableFunctions.hpp"
#include "GpuInitGroups.cu"
#include "GroupByRuntime.cpp"
#include "JoinHashTable/Runtime/JoinHashTableQueryRuntime.cpp"
#include "MurmurHash.cpp"
#include "TopKRuntime.cpp"
#include "../Utils/ChunkIter.cpp"
#include "DateTruncate.cpp"
#include "ExtractFromTime.cpp"
#include "ArrayOps.cpp"
#include "DateAdd.cpp"
#include "GeoOps.cpp"
#include "StringFunctions.cpp"
#include "../Utils/Regexp.cpp"
#include "../Utils/StringLike.cpp"
+ Include dependency graph for cuda_mapd_rt.cu:

Go to the source code of this file.

Macros

#define init_group_by_buffer_gpu_impl   init_group_by_buffer_gpu
 
#define DEF_AGG_ID_INT_SHARED(n)
 
#define DEF_SKIP_AGG(base_agg_func)
 
#define DATA_T   int64_t
 
#define ADDR_T   uint64_t
 
#define DATA_T   int32_t
 
#define ADDR_T   uint32_t
 
#define DEF_SKIP_AGG(base_agg_func)
 
#define DATA_T   double
 
#define ADDR_T   uint64_t
 
#define DATA_T   float
 
#define ADDR_T   uint32_t
 
#define EXECUTE_INCLUDE
 

Functions

__device__ int64_t get_thread_index ()
 
__device__ int64_t get_block_index ()
 
__device__ int32_t pos_start_impl (const int32_t *row_index_resume)
 
__device__ int32_t group_buff_idx_impl ()
 
__device__ int32_t pos_step_impl ()
 
__device__ int8_t thread_warp_idx (const int8_t warp_sz)
 
__device__ const int64_t * init_shared_mem_nop (const int64_t *groups_buffer, const int32_t groups_buffer_size)
 
__device__ void write_back_nop (int64_t *dest, int64_t *src, const int32_t sz)
 
__device__ int64_t * declare_dynamic_shared_memory ()
 
__device__ const int64_t * init_shared_mem (const int64_t *global_groups_buffer, const int32_t groups_buffer_size)
 
__inline__ __device__ uint32_t get_smid (void)
 
__device__ bool dynamic_watchdog ()
 
__device__ bool check_interrupt ()
 
template<typename T = unsigned long long>
__device__ T get_empty_key ()
 
template<>
__device__ unsigned int get_empty_key ()
 
template<typename T >
__device__ int64_t * get_matching_group_value (int64_t *groups_buffer, const uint32_t h, const T *key, const uint32_t key_count, const uint32_t row_size_quad)
 
__device__ int64_t * get_matching_group_value (int64_t *groups_buffer, const uint32_t h, const int64_t *key, const uint32_t key_count, const uint32_t key_width, const uint32_t row_size_quad)
 
template<typename T >
__device__ int32_t get_matching_group_value_columnar_slot (int64_t *groups_buffer, const uint32_t entry_count, const uint32_t h, const T *key, const uint32_t key_count)
 
__device__ int32_t get_matching_group_value_columnar_slot (int64_t *groups_buffer, const uint32_t entry_count, const uint32_t h, const int64_t *key, const uint32_t key_count, const uint32_t key_width)
 
__device__ int64_t * get_matching_group_value_columnar (int64_t *groups_buffer, const uint32_t h, const int64_t *key, const uint32_t key_qw_count, const size_t entry_count)
 
__device__ int64_t atomicMax64 (int64_t *address, int64_t val)
 
__device__ int64_t atomicMin64 (int64_t *address, int64_t val)
 
__device__ double atomicMax (double *address, double val)
 
__device__ float atomicMax (float *address, float val)
 
__device__ double atomicMin (double *address, double val)
 
__device__ double atomicMin (float *address, float val)
 
__device__ uint64_t agg_count_shared (uint64_t *agg, const int64_t val)
 
__device__ uint32_t agg_count_int32_shared (uint32_t *agg, const int32_t val)
 
__device__ uint64_t agg_count_double_shared (uint64_t *agg, const double val)
 
__device__ uint32_t agg_count_float_shared (uint32_t *agg, const float val)
 
__device__ int64_t agg_sum_shared (int64_t *agg, const int64_t val)
 
__device__ int32_t agg_sum_int32_shared (int32_t *agg, const int32_t val)
 
__device__ void agg_sum_float_shared (int32_t *agg, const float val)
 
__device__ void agg_sum_double_shared (int64_t *agg, const double val)
 
__device__ void agg_max_shared (int64_t *agg, const int64_t val)
 
__device__ void agg_max_int32_shared (int32_t *agg, const int32_t val)
 
__device__ void agg_max_double_shared (int64_t *agg, const double val)
 
__device__ void agg_max_float_shared (int32_t *agg, const float val)
 
__device__ void agg_min_shared (int64_t *agg, const int64_t val)
 
__device__ void agg_min_int32_shared (int32_t *agg, const int32_t val)
 
__device__ void atomicMax16 (int16_t *agg, const int16_t val)
 
__device__ void atomicMax8 (int8_t *agg, const int8_t val)
 
__device__ void atomicMin16 (int16_t *agg, const int16_t val)
 
__device__ void atomicMin16SkipVal (int16_t *agg, const int16_t val, const int16_t skip_val)
 
__device__ void atomicMin8 (int8_t *agg, const int8_t val)
 
__device__ void atomicMin8SkipVal (int8_t *agg, const int8_t val, const int8_t skip_val)
 
__device__ void agg_max_int16_shared (int16_t *agg, const int16_t val)
 
__device__ void agg_max_int8_shared (int8_t *agg, const int8_t val)
 
__device__ void agg_min_int16_shared (int16_t *agg, const int16_t val)
 
__device__ void agg_min_int8_shared (int8_t *agg, const int8_t val)
 
__device__ void agg_min_double_shared (int64_t *agg, const double val)
 
__device__ void agg_min_float_shared (int32_t *agg, const float val)
 
__device__ void agg_id_shared (int64_t *agg, const int64_t val)
 
__device__ int8_t * agg_id_varlen_shared (int8_t *varlen_buffer, const int64_t offset, const int8_t *value, const int64_t size_bytes)
 
__device__ int32_t checked_single_agg_id_shared (int64_t *agg, const int64_t val, const int64_t null_val)
 
__device__ void agg_id_double_shared (int64_t *agg, const double val)
 
__device__ int32_t checked_single_agg_id_double_shared (int64_t *agg, const double val, const double null_val)
 
__device__ void agg_id_double_shared_slow (int64_t *agg, const double *val)
 
__device__ int32_t checked_single_agg_id_double_shared_slow (int64_t *agg, const double *valp, const double null_val)
 
__device__ void agg_id_float_shared (int32_t *agg, const float val)
 
__device__ int32_t checked_single_agg_id_float_shared (int32_t *agg, const float val, const float null_val)
 
__device__ void agg_max_int32_skip_val_shared (int32_t *agg, const int32_t val, const int32_t skip_val)
 
__device__ void agg_max_int16_skip_val_shared (int16_t *agg, const int16_t val, const int16_t skip_val)
 
__device__ void agg_min_int16_skip_val_shared (int16_t *agg, const int16_t val, const int16_t skip_val)
 
__device__ void agg_max_int8_skip_val_shared (int8_t *agg, const int8_t val, const int8_t skip_val)
 
__device__ void agg_min_int8_skip_val_shared (int8_t *agg, const int8_t val, const int8_t skip_val)
 
__device__ int32_t atomicMin32SkipVal (int32_t *address, int32_t val, const int32_t skip_val)
 
__device__ void agg_min_int32_skip_val_shared (int32_t *agg, const int32_t val, const int32_t skip_val)
 
__device__ int32_t atomicSum32SkipVal (int32_t *address, const int32_t val, const int32_t skip_val)
 
__device__ int32_t agg_sum_int32_skip_val_shared (int32_t *agg, const int32_t val, const int32_t skip_val)
 
__device__ int64_t atomicSum64SkipVal (int64_t *address, const int64_t val, const int64_t skip_val)
 
__device__ int64_t agg_sum_skip_val_shared (int64_t *agg, const int64_t val, const int64_t skip_val)
 
__device__ int64_t atomicMin64SkipVal (int64_t *address, int64_t val, const int64_t skip_val)
 
__device__ void agg_min_skip_val_shared (int64_t *agg, const int64_t val, const int64_t skip_val)
 
__device__ int64_t atomicMax64SkipVal (int64_t *address, int64_t val, const int64_t skip_val)
 
__device__ void agg_max_skip_val_shared (int64_t *agg, const int64_t val, const int64_t skip_val)
 
__device__ void agg_max_float_skip_val_shared (int32_t *agg, const float val, const float skip_val)
 
__device__ float atomicMinFltSkipVal (int32_t *address, float val, const float skip_val)
 
__device__ void agg_min_float_skip_val_shared (int32_t *agg, const float val, const float skip_val)
 
__device__ void atomicSumFltSkipVal (float *address, const float val, const float skip_val)
 
__device__ void agg_sum_float_skip_val_shared (int32_t *agg, const float val, const float skip_val)
 
__device__ void atomicSumDblSkipVal (double *address, const double val, const double skip_val)
 
__device__ void agg_sum_double_skip_val_shared (int64_t *agg, const double val, const double skip_val)
 
__device__ double atomicMinDblSkipVal (double *address, double val, const double skip_val)
 
__device__ void agg_min_double_skip_val_shared (int64_t *agg, const double val, const double skip_val)
 
__device__ void agg_max_double_skip_val_shared (int64_t *agg, const double val, const double skip_val)
 
__device__ bool slotEmptyKeyCAS (int64_t *slot, int64_t new_val, int64_t init_val)
 
__device__ bool slotEmptyKeyCAS_int32 (int32_t *slot, int32_t new_val, int32_t init_val)
 
__device__ bool slotEmptyKeyCAS_int16 (int16_t *slot, int16_t new_val, int16_t init_val)
 
__device__ bool slotEmptyKeyCAS_int8 (int8_t *slot, int8_t new_val, int8_t init_val)
 
__device__ uint64_t string_decode (int8_t *chunk_iter_, int64_t pos)
 
__device__ void linear_probabilistic_count (uint8_t *bitmap, const uint32_t bitmap_bytes, const uint8_t *key_bytes, const uint32_t key_len)
 
__device__ void agg_count_distinct_bitmap_gpu (int64_t *agg, const int64_t val, const int64_t min_val, const int64_t base_dev_addr, const int64_t base_host_addr, const uint64_t sub_bitmap_count, const uint64_t bitmap_bytes)
 
__device__ void agg_count_distinct_bitmap_skip_val_gpu (int64_t *agg, const int64_t val, const int64_t min_val, const int64_t skip_val, const int64_t base_dev_addr, const int64_t base_host_addr, const uint64_t sub_bitmap_count, const uint64_t bitmap_bytes)
 
__device__ void agg_approximate_count_distinct_gpu (int64_t *agg, const int64_t key, const uint32_t b, const int64_t base_dev_addr, const int64_t base_host_addr)
 
__device__ void force_sync ()
 
__device__ void sync_warp ()
 
__device__ void sync_warp_protected (int64_t thread_pos, int64_t row_count)
 
__device__ void sync_threadblock ()
 
__device__ void write_back_non_grouped_agg (int64_t *input_buffer, int64_t *output_buffer, const int32_t agg_idx)
 

Variables

__device__ int64_t dw_sm_cycle_start [128]
 
__device__ int64_t dw_cycle_budget = 0
 
__device__ int32_t dw_abort = 0
 
__device__ int32_t runtime_interrupt_flag = 0
 

Macro Definition Documentation

#define ADDR_T   uint64_t

Definition at line 1006 of file cuda_mapd_rt.cu.

#define ADDR_T   uint32_t

Definition at line 1006 of file cuda_mapd_rt.cu.

#define ADDR_T   uint64_t

Definition at line 1006 of file cuda_mapd_rt.cu.

#define ADDR_T   uint32_t

Definition at line 1006 of file cuda_mapd_rt.cu.

#define DATA_T   int64_t

Definition at line 1005 of file cuda_mapd_rt.cu.

#define DATA_T   int32_t

Definition at line 1005 of file cuda_mapd_rt.cu.

#define DATA_T   double

Definition at line 1005 of file cuda_mapd_rt.cu.

#define DATA_T   float

Definition at line 1005 of file cuda_mapd_rt.cu.

#define DEF_AGG_ID_INT_SHARED (   n)
Value:
extern "C" __device__ void agg_id_int##n##_shared(int##n##_t* agg, \
const int##n##_t val) { \
*agg = val; \
}

Definition at line 718 of file cuda_mapd_rt.cu.

#define DEF_SKIP_AGG (   base_agg_func)
Value:
extern "C" __device__ ADDR_T base_agg_func##_skip_val_shared( \
ADDR_T* agg, const DATA_T val, const DATA_T skip_val) { \
if (val != skip_val) { \
return base_agg_func##_shared(agg, val); \
} \
return 0; \
}

Definition at line 990 of file cuda_mapd_rt.cu.

#define DEF_SKIP_AGG (   base_agg_func)
Value:
extern "C" __device__ ADDR_T base_agg_func##_skip_val_shared( \
ADDR_T* agg, const DATA_T val, const DATA_T skip_val) { \
if (val != skip_val) { \
return base_agg_func##_shared(agg, val); \
} \
return *agg; \
}

Definition at line 990 of file cuda_mapd_rt.cu.
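
A minimal sketch of how this macro is meant to be instantiated (the DATA_T/ADDR_T pairing and the agg_count target are assumptions inferred from the macro lists above, not a verbatim copy of the source):

// Hypothetical expansion sketch: with DATA_T defined as int64_t, ADDR_T as
// uint64_t, and DEF_SKIP_AGG(agg_count), the preprocessor would produce
// roughly the following device function, which forwards to agg_count_shared
// only when the value is not the sentinel skip value.
extern "C" __device__ uint64_t agg_count_skip_val_shared(uint64_t* agg,
                                                         const int64_t val,
                                                         const int64_t skip_val) {
  if (val != skip_val) {
    return agg_count_shared(agg, val);
  }
  return 0;
}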

#define EXECUTE_INCLUDE

Definition at line 1195 of file cuda_mapd_rt.cu.

#define init_group_by_buffer_gpu_impl   init_group_by_buffer_gpu

Definition at line 83 of file cuda_mapd_rt.cu.

Function Documentation

__device__ void agg_approximate_count_distinct_gpu ( int64_t *  agg,
const int64_t  key,
const uint32_t  b,
const int64_t  base_dev_addr,
const int64_t  base_host_addr 
)

Definition at line 1272 of file cuda_mapd_rt.cu.

References atomicMax(), get_rank(), and MurmurHash64A().

1277  {
1278  const uint64_t hash = MurmurHash64A(&key, sizeof(key), 0);
1279  const uint32_t index = hash >> (64 - b);
1280  const int32_t rank = get_rank(hash << b, 64 - b);
1281  const int64_t host_addr = *agg;
1282  int32_t* M = (int32_t*)(base_dev_addr + host_addr - base_host_addr);
1283  atomicMax(&M[index], rank);
1284 }
FORCE_INLINE uint8_t get_rank(uint64_t x, uint32_t b)
RUNTIME_EXPORT NEVER_INLINE DEVICE uint64_t MurmurHash64A(const void *key, int len, uint64_t seed)
Definition: MurmurHash.cpp:27
__device__ double atomicMax(double *address, double val)

+ Here is the call graph for this function:
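
The function updates one HyperLogLog register: the top b bits of the 64-bit MurmurHash select the register, the remaining bits supply the rank, and atomicMax keeps the largest rank observed. Below is an illustrative host-side analogue (a sketch only; hll_update is a hypothetical helper, and it assumes get_rank(x, n) returns one plus the number of leading zero bits of x, capped at n):

// Host-side analogue of the register update above (assumption: b > 0 is the
// number of index bits, M points to the 2^b HyperLogLog registers).
#include <cstdint>
#include <algorithm>

void hll_update(int32_t* M, uint64_t hash, uint32_t b) {
  const uint32_t index = static_cast<uint32_t>(hash >> (64 - b));  // top b bits pick a register
  uint64_t rest = hash << b;                                       // remaining 64 - b bits
  int32_t rank = 1;
  while (rank <= static_cast<int32_t>(64 - b) && !(rest & (1ULL << 63))) {
    ++rank;        // count leading zeros of the remaining bits, plus one
    rest <<= 1;
  }
  M[index] = std::max(M[index], rank);  // same effect as the atomicMax in the kernel
}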

__device__ void agg_count_distinct_bitmap_gpu ( int64_t *  agg,
const int64_t  val,
const int64_t  min_val,
const int64_t  base_dev_addr,
const int64_t  base_host_addr,
const uint64_t  sub_bitmap_count,
const uint64_t  bitmap_bytes 
)

Definition at line 1225 of file cuda_mapd_rt.cu.

Referenced by agg_count_distinct_bitmap_skip_val_gpu().

1231  {
1232  const uint64_t bitmap_idx = val - min_val;
1233  const uint32_t byte_idx = bitmap_idx >> 3;
1234  const uint32_t word_idx = byte_idx >> 2;
1235  const uint32_t byte_word_idx = byte_idx & 3;
1236  const int64_t host_addr = *agg;
1237  uint32_t* bitmap = (uint32_t*)(base_dev_addr + host_addr - base_host_addr +
1238  (threadIdx.x & (sub_bitmap_count - 1)) * bitmap_bytes);
1239  switch (byte_word_idx) {
1240  case 0:
1241  atomicOr(&bitmap[word_idx], 1 << (bitmap_idx & 7));
1242  break;
1243  case 1:
1244  atomicOr(&bitmap[word_idx], 1 << ((bitmap_idx & 7) + 8));
1245  break;
1246  case 2:
1247  atomicOr(&bitmap[word_idx], 1 << ((bitmap_idx & 7) + 16));
1248  break;
1249  case 3:
1250  atomicOr(&bitmap[word_idx], 1 << ((bitmap_idx & 7) + 24));
1251  break;
1252  default:
1253  break;
1254  }
1255 }

+ Here is the caller graph for this function:
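
Each distinct value sets one bit in a bitmap; the thread selects one of sub_bitmap_count sub-bitmaps from its threadIdx.x (so sub_bitmap_count is expected to be a power of two), and the switch addresses the target bit through 32-bit words so atomicOr can be used. A host-side sketch of the same bit addressing (bitmap_position is a hypothetical helper, not part of the runtime):

// Given val - min_val, compute the 32-bit word and the bit within it that the
// kernel ORs in.
#include <cstdint>

void bitmap_position(int64_t val, int64_t min_val,
                     uint32_t* word_idx_out, uint32_t* bit_in_word_out) {
  const uint64_t bitmap_idx = static_cast<uint64_t>(val - min_val);
  const uint32_t byte_idx = static_cast<uint32_t>(bitmap_idx >> 3);  // which byte
  *word_idx_out = byte_idx >> 2;                                     // which 32-bit word
  const uint32_t byte_in_word = byte_idx & 3;                        // 0..3
  *bit_in_word_out = (bitmap_idx & 7) + 8 * byte_in_word;            // matches the switch above
}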

__device__ void agg_count_distinct_bitmap_skip_val_gpu ( int64_t *  agg,
const int64_t  val,
const int64_t  min_val,
const int64_t  skip_val,
const int64_t  base_dev_addr,
const int64_t  base_host_addr,
const uint64_t  sub_bitmap_count,
const uint64_t  bitmap_bytes 
)

Definition at line 1257 of file cuda_mapd_rt.cu.

References agg_count_distinct_bitmap_gpu().

1265  {
1266  if (val != skip_val) {
1267  agg_count_distinct_bitmap_gpu(
1268  agg, val, min_val, base_dev_addr, base_host_addr, sub_bitmap_count, bitmap_bytes);
1269  }
1270 }
__device__ void agg_count_distinct_bitmap_gpu(int64_t *agg, const int64_t val, const int64_t min_val, const int64_t base_dev_addr, const int64_t base_host_addr, const uint64_t sub_bitmap_count, const uint64_t bitmap_bytes)

+ Here is the call graph for this function:

__device__ uint64_t agg_count_double_shared ( uint64_t *  agg,
const double  val 
)

Definition at line 439 of file cuda_mapd_rt.cu.

References agg_count_shared().

439  {
440  return agg_count_shared(agg, val);
441 }
__device__ uint64_t agg_count_shared(uint64_t *agg, const int64_t val)

+ Here is the call graph for this function:

__device__ uint32_t agg_count_float_shared ( uint32_t *  agg,
const float  val 
)

Definition at line 443 of file cuda_mapd_rt.cu.

References agg_count_int32_shared().

443  {
444  return agg_count_int32_shared(agg, val);
445 }
__device__ uint32_t agg_count_int32_shared(uint32_t *agg, const int32_t val)

+ Here is the call graph for this function:

__device__ uint32_t agg_count_int32_shared ( uint32_t *  agg,
const int32_t  val 
)

Definition at line 435 of file cuda_mapd_rt.cu.

Referenced by agg_count_float_shared().

435  {
436  return atomicAdd(agg, 1UL);
437 }

+ Here is the caller graph for this function:

__device__ uint64_t agg_count_shared ( uint64_t *  agg,
const int64_t  val 
)

Definition at line 431 of file cuda_mapd_rt.cu.

Referenced by agg_count_double_shared().

431  {
432  return static_cast<uint64_t>(atomicAdd(reinterpret_cast<uint32_t*>(agg), 1UL));
433 }

+ Here is the caller graph for this function:

__device__ void agg_id_double_shared ( int64_t *  agg,
const double  val 
)

Definition at line 730 of file cuda_mapd_rt.cu.

730  {
731  *agg = *(reinterpret_cast<const int64_t*>(&val));
732 }
__device__ void agg_id_double_shared_slow ( int64_t *  agg,
const double *  val 
)

Definition at line 761 of file cuda_mapd_rt.cu.

761  {
762  *agg = *(reinterpret_cast<const int64_t*>(val));
763 }
__device__ void agg_id_float_shared ( int32_t *  agg,
const float  val 
)

Definition at line 794 of file cuda_mapd_rt.cu.

794  {
795  *agg = __float_as_int(val);
796 }
__device__ void agg_id_shared ( int64_t *  agg,
const int64_t  val 
)

Definition at line 677 of file cuda_mapd_rt.cu.

677  {
678  *agg = val;
679 }
__device__ int8_t* agg_id_varlen_shared ( int8_t *  varlen_buffer,
const int64_t  offset,
const int8_t *  value,
const int64_t  size_bytes 
)

Definition at line 681 of file cuda_mapd_rt.cu.


684  {
685  for (auto i = 0; i < size_bytes; i++) {
686  varlen_buffer[offset + i] = value[i];
687  }
688  return &varlen_buffer[offset];
689 }
__device__ void agg_max_double_shared ( int64_t *  agg,
const double  val 
)

Definition at line 471 of file cuda_mapd_rt.cu.

References atomicMax().

471  {
472  atomicMax(reinterpret_cast<double*>(agg), val);
473 }
__device__ double atomicMax(double *address, double val)

+ Here is the call graph for this function:

__device__ void agg_max_double_skip_val_shared ( int64_t *  agg,
const double  val,
const double  skip_val 
)

Definition at line 1100 of file cuda_mapd_rt.cu.

References atomicMax().

1102  {
1103  if (__double_as_longlong(val) != __double_as_longlong(skip_val)) {
1104  double old = __longlong_as_double(atomicExch(
1105  reinterpret_cast<unsigned long long int*>(agg), __double_as_longlong(-DBL_MAX)));
1106  atomicMax(reinterpret_cast<double*>(agg),
1107  __double_as_longlong(old) == __double_as_longlong(skip_val)
1108  ? val
1109  : fmax(old, val));
1110  }
1111 }
__device__ double atomicMax(double *address, double val)

+ Here is the call graph for this function:

__device__ void agg_max_float_shared ( int32_t *  agg,
const float  val 
)

Definition at line 475 of file cuda_mapd_rt.cu.

References atomicMax().

475  {
476  atomicMax(reinterpret_cast<float*>(agg), val);
477 }
__device__ double atomicMax(double *address, double val)

+ Here is the call graph for this function:

__device__ void agg_max_float_skip_val_shared ( int32_t *  agg,
const float  val,
const float  skip_val 
)

Definition at line 1012 of file cuda_mapd_rt.cu.

References atomicMax().

1014  {
1015  if (__float_as_int(val) != __float_as_int(skip_val)) {
1016  float old = atomicExch(reinterpret_cast<float*>(agg), -FLT_MAX);
1017  atomicMax(reinterpret_cast<float*>(agg),
1018  __float_as_int(old) == __float_as_int(skip_val) ? val : fmaxf(old, val));
1019  }
1020 }
__device__ double atomicMax(double *address, double val)

+ Here is the call graph for this function:

__device__ void agg_max_int16_shared ( int16_t *  agg,
const int16_t  val 
)

Definition at line 653 of file cuda_mapd_rt.cu.

References atomicMax16().

Referenced by agg_max_int16_skip_val_shared().

653  {
654  return atomicMax16(agg, val);
655 }
__device__ void atomicMax16(int16_t *agg, const int16_t val)

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

__device__ void agg_max_int16_skip_val_shared ( int16_t *  agg,
const int16_t  val,
const int16_t  skip_val 
)

Definition at line 855 of file cuda_mapd_rt.cu.

References agg_max_int16_shared().

857  {
858  if (val != skip_val) {
859  agg_max_int16_shared(agg, val);
860  }
861 }
__device__ void agg_max_int16_shared(int16_t *agg, const int16_t val)

+ Here is the call graph for this function:

__device__ void agg_max_int32_shared ( int32_t *  agg,
const int32_t  val 
)

Definition at line 467 of file cuda_mapd_rt.cu.

References atomicMax().

Referenced by agg_max_int32_skip_val_shared().

467  {
468  atomicMax(agg, val);
469 }
__device__ double atomicMax(double *address, double val)

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

__device__ void agg_max_int32_skip_val_shared ( int32_t *  agg,
const int32_t  val,
const int32_t  skip_val 
)

Definition at line 847 of file cuda_mapd_rt.cu.

References agg_max_int32_shared().

849  {
850  if (val != skip_val) {
851  agg_max_int32_shared(agg, val);
852  }
853 }
__device__ void agg_max_int32_shared(int32_t *agg, const int32_t val)

+ Here is the call graph for this function:

__device__ void agg_max_int8_shared ( int8_t *  agg,
const int8_t  val 
)

Definition at line 657 of file cuda_mapd_rt.cu.

References atomicMax8().

Referenced by agg_max_int8_skip_val_shared().

657  {
658  return atomicMax8(agg, val);
659 }
__device__ void atomicMax8(int8_t *agg, const int8_t val)

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

__device__ void agg_max_int8_skip_val_shared ( int8_t *  agg,
const int8_t  val,
const int8_t  skip_val 
)

Definition at line 871 of file cuda_mapd_rt.cu.

References agg_max_int8_shared().

873  {
874  if (val != skip_val) {
875  agg_max_int8_shared(agg, val);
876  }
877 }
__device__ void agg_max_int8_shared(int8_t *agg, const int8_t val)

+ Here is the call graph for this function:

__device__ void agg_max_shared ( int64_t *  agg,
const int64_t  val 
)

Definition at line 463 of file cuda_mapd_rt.cu.

References atomicMax64().

463  {
464  atomicMax64(agg, val);
465 }
__device__ int64_t atomicMax64(int64_t *address, int64_t val)

+ Here is the call graph for this function:

__device__ void agg_max_skip_val_shared ( int64_t *  agg,
const int64_t  val,
const int64_t  skip_val 
)

Definition at line 981 of file cuda_mapd_rt.cu.

References atomicMax64SkipVal().

983  {
984  if (val != skip_val) {
985  atomicMax64SkipVal(agg, val, skip_val);
986  }
987 }
__device__ int64_t atomicMax64SkipVal(int64_t *address, int64_t val, const int64_t skip_val)

+ Here is the call graph for this function:

__device__ void agg_min_double_shared ( int64_t *  agg,
const double  val 
)

Definition at line 669 of file cuda_mapd_rt.cu.

References atomicMin().

669  {
670  atomicMin(reinterpret_cast<double*>(agg), val);
671 }
__device__ double atomicMin(double *address, double val)

+ Here is the call graph for this function:

__device__ void agg_min_double_skip_val_shared ( int64_t *  agg,
const double  val,
const double  skip_val 
)

Definition at line 1092 of file cuda_mapd_rt.cu.

References atomicMinDblSkipVal().

1094  {
1095  if (val != skip_val) {
1096  atomicMinDblSkipVal(reinterpret_cast<double*>(agg), val, skip_val);
1097  }
1098 }
__device__ double atomicMinDblSkipVal(double *address, double val, const double skip_val)

+ Here is the call graph for this function:

__device__ void agg_min_float_shared ( int32_t *  agg,
const float  val 
)

Definition at line 673 of file cuda_mapd_rt.cu.

References atomicMin().

673  {
674  atomicMin(reinterpret_cast<float*>(agg), val);
675 }
__device__ double atomicMin(double *address, double val)

+ Here is the call graph for this function:

__device__ void agg_min_float_skip_val_shared ( int32_t *  agg,
const float  val,
const float  skip_val 
)

Definition at line 1029 of file cuda_mapd_rt.cu.

References atomicMinFltSkipVal().

1031  {
1032  if (__float_as_int(val) != __float_as_int(skip_val)) {
1033  atomicMinFltSkipVal(agg, val, skip_val);
1034  }
1035 }
__device__ float atomicMinFltSkipVal(int32_t *address, float val, const float skip_val)

+ Here is the call graph for this function:

__device__ void agg_min_int16_shared ( int16_t *  agg,
const int16_t  val 
)

Definition at line 661 of file cuda_mapd_rt.cu.

References atomicMin16().

661  {
662  return atomicMin16(agg, val);
663 }
__device__ void atomicMin16(int16_t *agg, const int16_t val)

+ Here is the call graph for this function:

__device__ void agg_min_int16_skip_val_shared ( int16_t *  agg,
const int16_t  val,
const int16_t  skip_val 
)

Definition at line 863 of file cuda_mapd_rt.cu.

References atomicMin16SkipVal().

865  {
866  if (val != skip_val) {
867  atomicMin16SkipVal(agg, val, skip_val);
868  }
869 }
__device__ void atomicMin16SkipVal(int16_t *agg, const int16_t val, const int16_t skip_val)

+ Here is the call graph for this function:

__device__ void agg_min_int32_shared ( int32_t *  agg,
const int32_t  val 
)

Definition at line 483 of file cuda_mapd_rt.cu.

References atomicMin().

483  {
484  atomicMin(agg, val);
485 }
__device__ double atomicMin(double *address, double val)

+ Here is the call graph for this function:

__device__ void agg_min_int32_skip_val_shared ( int32_t *  agg,
const int32_t  val,
const int32_t  skip_val 
)

Definition at line 894 of file cuda_mapd_rt.cu.

References atomicMin32SkipVal().

896  {
897  if (val != skip_val) {
898  atomicMin32SkipVal(agg, val, skip_val);
899  }
900 }
__device__ int32_t atomicMin32SkipVal(int32_t *address, int32_t val, const int32_t skip_val)

+ Here is the call graph for this function:

__device__ void agg_min_int8_shared ( int8_t *  agg,
const int8_t  val 
)

Definition at line 665 of file cuda_mapd_rt.cu.

References atomicMin8().

665  {
666  return atomicMin8(agg, val);
667 }
__device__ void atomicMin8(int8_t *agg, const int8_t val)

+ Here is the call graph for this function:

__device__ void agg_min_int8_skip_val_shared ( int8_t *  agg,
const int8_t  val,
const int8_t  skip_val 
)

Definition at line 879 of file cuda_mapd_rt.cu.

References atomicMin8SkipVal().

881  {
882  if (val != skip_val) {
883  atomicMin8SkipVal(agg, val, skip_val);
884  }
885 }
__device__ void atomicMin8SkipVal(int8_t *agg, const int8_t val, const int8_t skip_val)

+ Here is the call graph for this function:

__device__ void agg_min_shared ( int64_t *  agg,
const int64_t  val 
)

Definition at line 479 of file cuda_mapd_rt.cu.

References atomicMin64().

479  {
480  atomicMin64(agg, val);
481 }
__device__ int64_t atomicMin64(int64_t *address, int64_t val)

+ Here is the call graph for this function:

__device__ void agg_min_skip_val_shared ( int64_t *  agg,
const int64_t  val,
const int64_t  skip_val 
)

Definition at line 956 of file cuda_mapd_rt.cu.

References atomicMin64SkipVal().

958  {
959  if (val != skip_val) {
960  atomicMin64SkipVal(agg, val, skip_val);
961  }
962 }
__device__ int64_t atomicMin64SkipVal(int64_t *address, int64_t val, const int64_t skip_val)

+ Here is the call graph for this function:

__device__ void agg_sum_double_shared ( int64_t *  agg,
const double  val 
)

Definition at line 459 of file cuda_mapd_rt.cu.

459  {
460  atomicAdd(reinterpret_cast<double*>(agg), val);
461 }
__device__ void agg_sum_double_skip_val_shared ( int64_t *  agg,
const double  val,
const double  skip_val 
)

Definition at line 1062 of file cuda_mapd_rt.cu.

References atomicSumDblSkipVal().

1064  {
1065  if (__double_as_longlong(val) != __double_as_longlong(skip_val)) {
1066  atomicSumDblSkipVal(reinterpret_cast<double*>(agg), val, skip_val);
1067  }
1068 }
__device__ void atomicSumDblSkipVal(double *address, const double val, const double skip_val)

+ Here is the call graph for this function:

__device__ void agg_sum_float_shared ( int32_t *  agg,
const float  val 
)

Definition at line 455 of file cuda_mapd_rt.cu.

455  {
456  atomicAdd(reinterpret_cast<float*>(agg), val);
457 }
__device__ void agg_sum_float_skip_val_shared ( int32_t *  agg,
const float  val,
const float  skip_val 
)

Definition at line 1044 of file cuda_mapd_rt.cu.

References atomicSumFltSkipVal().

1046  {
1047  if (__float_as_int(val) != __float_as_int(skip_val)) {
1048  atomicSumFltSkipVal(reinterpret_cast<float*>(agg), val, skip_val);
1049  }
1050 }
__device__ void atomicSumFltSkipVal(float *address, const float val, const float skip_val)

+ Here is the call graph for this function:

__device__ int32_t agg_sum_int32_shared ( int32_t *  agg,
const int32_t  val 
)

Definition at line 451 of file cuda_mapd_rt.cu.

451  {
452  return atomicAdd(agg, val);
453 }
__device__ int32_t agg_sum_int32_skip_val_shared ( int32_t *  agg,
const int32_t  val,
const int32_t  skip_val 
)

Definition at line 911 of file cuda_mapd_rt.cu.

References atomicSum32SkipVal().

913  {
914  if (val != skip_val) {
915  const int32_t old = atomicSum32SkipVal(agg, val, skip_val);
916  return old;
917  }
918  return 0;
919 }
__device__ int32_t atomicSum32SkipVal(int32_t *address, const int32_t val, const int32_t skip_val)

+ Here is the call graph for this function:

__device__ int64_t agg_sum_shared ( int64_t *  agg,
const int64_t  val 
)

Definition at line 447 of file cuda_mapd_rt.cu.

Referenced by write_back_non_grouped_agg().

447  {
448  return atomicAdd(reinterpret_cast<unsigned long long*>(agg), val);
449 }

+ Here is the caller graph for this function:

__device__ int64_t agg_sum_skip_val_shared ( int64_t *  agg,
const int64_t  val,
const int64_t  skip_val 
)

Definition at line 930 of file cuda_mapd_rt.cu.

References atomicSum64SkipVal().

932  {
933  if (val != skip_val) {
934  return atomicSum64SkipVal(agg, val, skip_val);
935  }
936  return 0;
937 }
__device__ int64_t atomicSum64SkipVal(int64_t *address, const int64_t val, const int64_t skip_val)

+ Here is the call graph for this function:

__device__ double atomicMax ( double *  address,
double  val 
)

Definition at line 373 of file cuda_mapd_rt.cu.

Referenced by agg_approximate_count_distinct_gpu(), agg_max_double_shared(), agg_max_double_skip_val_shared(), agg_max_float_shared(), agg_max_float_skip_val_shared(), agg_max_int32_shared(), and approximate_distinct_tuples_impl().

373  {
374  unsigned long long int* address_as_ull = (unsigned long long int*)address;
375  unsigned long long int old = *address_as_ull, assumed;
376 
377  do {
378  assumed = old;
379  old = atomicCAS(address_as_ull,
380  assumed,
381  __double_as_longlong(max(val, __longlong_as_double(assumed))));
382 
383  // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN)
384  } while (assumed != old);
385 
386  return __longlong_as_double(old);
387 }

+ Here is the caller graph for this function:
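
This is the usual compare-and-swap emulation of a floating-point atomic: reread, combine, and atomicCAS on the 64-bit bit pattern until no other thread has intervened between the read and the swap. A generic sketch of the same pattern (atomic_combine_double is a hypothetical template, shown only to make the loop structure explicit):

// Apply any combining function to a double in global memory, the same way
// atomicMax(double*, double) above does with max(). Only atomicCAS on
// unsigned long long is assumed.
template <typename F>
__device__ double atomic_combine_double(double* address, double val, F combine) {
  unsigned long long int* address_as_ull =
      reinterpret_cast<unsigned long long int*>(address);
  unsigned long long int old = *address_as_ull, assumed;
  do {
    assumed = old;
    const double combined = combine(val, __longlong_as_double(assumed));
    old = atomicCAS(address_as_ull, assumed, __double_as_longlong(combined));
  } while (assumed != old);  // bit-pattern compare, so the loop also terminates for NaN payloads
  return __longlong_as_double(old);
}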

__device__ float atomicMax ( float *  address,
float  val 
)

Definition at line 389 of file cuda_mapd_rt.cu.

389  {
390  int* address_as_int = (int*)address;
391  int old = *address_as_int, assumed;
392 
393  do {
394  assumed = old;
395  old = atomicCAS(
396  address_as_int, assumed, __float_as_int(max(val, __int_as_float(assumed))));
397 
398  // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN)
399  } while (assumed != old);
400 
401  return __int_as_float(old);
402 }
__device__ void atomicMax16 ( int16_t *  agg,
const int16_t  val 
)

Definition at line 501 of file cuda_mapd_rt.cu.

Referenced by agg_max_int16_shared().

501  {
502  // properly align the input pointer:
503  unsigned int* base_address_u32 =
504  reinterpret_cast<unsigned int*>(reinterpret_cast<size_t>(agg) & ~0x3);
505 
506  unsigned int old_value = *base_address_u32;
507  unsigned int swap_value, compare_value;
508  do {
509  compare_value = old_value;
510  swap_value =
511  (reinterpret_cast<size_t>(agg) & 0x2)
512  ? static_cast<unsigned int>(max(static_cast<int16_t>(old_value >> 16), val))
513  << 16 |
514  (old_value & 0xFFFF)
515  : (old_value & 0xFFFF0000) |
516  static_cast<unsigned int>(
517  max(static_cast<int16_t>(old_value & 0xFFFF), val));
518  old_value = atomicCAS(base_address_u32, compare_value, swap_value);
519  } while (old_value != compare_value);
520 }

+ Here is the caller graph for this function:
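
Since no native 16-bit atomic is used here, the value is updated through a 32-bit atomicCAS on the aligned word that contains it; bit 1 of the pointer decides whether the upper or lower half-word is the target. A host-side sketch of the replacement-word arithmetic (max16_replacement is a hypothetical helper):

// Given the aligned 32-bit word and whether the int16_t lives in its upper
// half, build the replacement word that the CAS loop would try to install.
#include <cstdint>
#include <algorithm>

uint32_t max16_replacement(uint32_t old_word, bool upper_half, int16_t val) {
  if (upper_half) {
    const int16_t old_lane = static_cast<int16_t>(old_word >> 16);
    const uint16_t new_lane = static_cast<uint16_t>(std::max(old_lane, val));
    return (static_cast<uint32_t>(new_lane) << 16) | (old_word & 0xFFFFu);
  }
  const int16_t old_lane = static_cast<int16_t>(old_word & 0xFFFFu);
  const uint16_t new_lane = static_cast<uint16_t>(std::max(old_lane, val));
  return (old_word & 0xFFFF0000u) | new_lane;  // keep the untouched half-word
}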

__device__ int64_t atomicMax64 ( int64_t *  address,
int64_t  val 
)

Definition at line 331 of file cuda_mapd_rt.cu.

Referenced by agg_max_shared().

331  {
332  unsigned long long int* address_as_ull = (unsigned long long int*)address;
333  unsigned long long int old = *address_as_ull, assumed;
334 
335  do {
336  assumed = old;
337  old = atomicCAS(address_as_ull, assumed, max((long long)val, (long long)assumed));
338  } while (assumed != old);
339 
340  return old;
341 }

+ Here is the caller graph for this function:

__device__ int64_t atomicMax64SkipVal ( int64_t *  address,
int64_t  val,
const int64_t  skip_val 
)

Definition at line 964 of file cuda_mapd_rt.cu.

Referenced by agg_max_skip_val_shared().

966  {
967  unsigned long long int* address_as_ull =
968  reinterpret_cast<unsigned long long int*>(address);
969  unsigned long long int old = *address_as_ull, assumed;
970 
971  do {
972  assumed = old;
973  old = atomicCAS(address_as_ull,
974  assumed,
975  assumed == skip_val ? val : max((long long)val, (long long)assumed));
976  } while (assumed != old);
977 
978  return old;
979 }

+ Here is the caller graph for this function:

__device__ void atomicMax8 ( int8_t *  agg,
const int8_t  val 
)

Definition at line 523 of file cuda_mapd_rt.cu.

Referenced by agg_max_int8_shared().

523  {
524  // properly align the input pointer:
525  unsigned int* base_address_u32 =
526  reinterpret_cast<unsigned int*>(reinterpret_cast<size_t>(agg) & ~0x3);
527 
528  // __byte_perm(unsigned int A, unsigned int B, unsigned int s):
529  // if s == 0x3214 returns {A[31..24], A[23..16], A[15..8], B[7..0]}
530  // if s == 0x3240 returns {A[31..24], A[23..16], B[7...0], A[7..0]}
531  // if s == 0x3410 returns {A[31..24], B[7....0], A[15..8], A[7..0]}
532  // if s == 0x4210 returns {B[7....0], A[23..16], A[15..8], A[7..0]}
533  constexpr unsigned int byte_permutations[] = {0x3214, 0x3240, 0x3410, 0x4210};
534  unsigned int old_value = *base_address_u32;
535  unsigned int swap_value, compare_value;
536  do {
537  compare_value = old_value;
538  auto max_value = static_cast<unsigned int>(
539  // compare val with its corresponding bits in the compare_value
540  max(val,
541  static_cast<int8_t>(__byte_perm(
542  compare_value, 0, (reinterpret_cast<size_t>(agg) & 0x3) | 0x4440))));
543  swap_value = __byte_perm(
544  compare_value, max_value, byte_permutations[reinterpret_cast<size_t>(agg) & 0x3]);
545  old_value = atomicCAS(base_address_u32, compare_value, swap_value);
546  } while (compare_value != old_value);
547 }

+ Here is the caller graph for this function:

__device__ double atomicMin ( double *  address,
double  val 
)

Definition at line 404 of file cuda_mapd_rt.cu.

Referenced by agg_min_double_shared(), agg_min_float_shared(), agg_min_int32_shared(), atomicMin32SkipVal(), atomicMinFltSkipVal(), and compute_bucket_sizes_impl().

404  {
405  unsigned long long int* address_as_ull = (unsigned long long int*)address;
406  unsigned long long int old = *address_as_ull, assumed;
407 
408  do {
409  assumed = old;
410  old = atomicCAS(address_as_ull,
411  assumed,
412  __double_as_longlong(min(val, __longlong_as_double(assumed))));
413  } while (assumed != old);
414 
415  return __longlong_as_double(old);
416 }

+ Here is the caller graph for this function:

__device__ double atomicMin ( float *  address,
float  val 
)

Definition at line 418 of file cuda_mapd_rt.cu.

418  {
419  int* address_as_ull = (int*)address;
420  int old = *address_as_ull, assumed;
421 
422  do {
423  assumed = old;
424  old = atomicCAS(
425  address_as_ull, assumed, __float_as_int(min(val, __int_as_float(assumed))));
426  } while (assumed != old);
427 
428  return __int_as_float(old);
429 }
__device__ void atomicMin16 ( int16_t *  agg,
const int16_t  val 
)

Definition at line 563 of file cuda_mapd_rt.cu.

Referenced by agg_min_int16_shared().

563  {
564  // properly align the input pointer:
565  unsigned int* base_address_u32 =
566  reinterpret_cast<unsigned int*>(reinterpret_cast<size_t>(agg) & ~0x3);
567 
568  unsigned int old_value = *base_address_u32;
569  unsigned int swap_value, compare_value;
570  do {
571  compare_value = old_value;
572  swap_value =
573  (reinterpret_cast<size_t>(agg) & 0x2)
574  ? static_cast<unsigned int>(min(static_cast<int16_t>(old_value >> 16), val))
575  << 16 |
576  (old_value & 0xFFFF)
577  : (old_value & 0xFFFF0000) |
578  static_cast<unsigned int>(
579  min(static_cast<int16_t>(old_value & 0xFFFF), val));
580  old_value = atomicCAS(base_address_u32, compare_value, swap_value);
581  } while (old_value != compare_value);
582 }

+ Here is the caller graph for this function:

__device__ void atomicMin16SkipVal ( int16_t *  agg,
const int16_t  val,
const int16_t  skip_val 
)

Definition at line 585 of file cuda_mapd_rt.cu.

Referenced by agg_min_int16_skip_val_shared().

587  {
588  // properly align the input pointer:
589  unsigned int* base_address_u32 =
590  reinterpret_cast<unsigned int*>(reinterpret_cast<size_t>(agg) & ~0x3);
591 
592  unsigned int old_value = *base_address_u32;
593  unsigned int swap_value, compare_value;
594  do {
595  compare_value = old_value;
596  int16_t selected_old_val = (reinterpret_cast<size_t>(agg) & 0x2)
597  ? static_cast<int16_t>(old_value >> 16)
598  : static_cast<int16_t>(old_value & 0xFFFF);
599 
600  swap_value =
601  (reinterpret_cast<size_t>(agg) & 0x2)
602  ? static_cast<unsigned int>(
603  selected_old_val == skip_val ? val : min(selected_old_val, val))
604  << 16 |
605  (old_value & 0xFFFF)
606  : (old_value & 0xFFFF0000) |
607  static_cast<unsigned int>(
608  selected_old_val == skip_val ? val : min(selected_old_val, val));
609  old_value = atomicCAS(base_address_u32, compare_value, swap_value);
610  } while (old_value != compare_value);
611 }

+ Here is the caller graph for this function:

__device__ int32_t atomicMin32SkipVal ( int32_t *  address,
int32_t  val,
const int32_t  skip_val 
)

Definition at line 887 of file cuda_mapd_rt.cu.

References atomicMin().

Referenced by agg_min_int32_skip_val_shared().

889  {
890  int32_t old = atomicExch(address, INT_MAX);
891  return atomicMin(address, old == skip_val ? val : min(old, val));
892 }
__device__ double atomicMin(double *address, double val)

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

__device__ int64_t atomicMin64 ( int64_t *  address,
int64_t  val 
)

Definition at line 343 of file cuda_mapd_rt.cu.

Referenced by agg_min_shared().

343  {
344  unsigned long long int* address_as_ull = (unsigned long long int*)address;
345  unsigned long long int old = *address_as_ull, assumed;
346 
347  do {
348  assumed = old;
349  old = atomicCAS(address_as_ull, assumed, min((long long)val, (long long)assumed));
350  } while (assumed != old);
351 
352  return old;
353 }

+ Here is the caller graph for this function:

__device__ int64_t atomicMin64SkipVal ( int64_t *  address,
int64_t  val,
const int64_t  skip_val 
)

Definition at line 939 of file cuda_mapd_rt.cu.

Referenced by agg_min_skip_val_shared().

941  {
942  unsigned long long int* address_as_ull =
943  reinterpret_cast<unsigned long long int*>(address);
944  unsigned long long int old = *address_as_ull, assumed;
945 
946  do {
947  assumed = old;
948  old = atomicCAS(address_as_ull,
949  assumed,
950  assumed == skip_val ? val : min((long long)val, (long long)assumed));
951  } while (assumed != old);
952 
953  return old;
954 }

+ Here is the caller graph for this function:

__device__ void atomicMin8 ( int8_t *  agg,
const int8_t  val 
)

Definition at line 613 of file cuda_mapd_rt.cu.

Referenced by agg_min_int8_shared().

613  {
614  // properly align the input pointer:
615  unsigned int* base_address_u32 =
616  reinterpret_cast<unsigned int*>(reinterpret_cast<size_t>(agg) & ~0x3);
617 
618  constexpr unsigned int byte_permutations[] = {0x3214, 0x3240, 0x3410, 0x4210};
619  unsigned int old_value = *base_address_u32;
620  unsigned int swap_value, compare_value;
621  do {
622  compare_value = old_value;
623  auto min_value = static_cast<unsigned int>(
624  min(val,
625  static_cast<int8_t>(__byte_perm(
626  compare_value, 0, (reinterpret_cast<size_t>(agg) & 0x3) | 0x4440))));
627  swap_value = __byte_perm(
628  compare_value, min_value, byte_permutations[reinterpret_cast<size_t>(agg) & 0x3]);
629  old_value = atomicCAS(base_address_u32, compare_value, swap_value);
630  } while (compare_value != old_value);
631 }

+ Here is the caller graph for this function:

__device__ void atomicMin8SkipVal ( int8_t *  agg,
const int8_t  val,
const int8_t  skip_val 
)

Definition at line 633 of file cuda_mapd_rt.cu.

Referenced by agg_min_int8_skip_val_shared().

633  {
634  // properly align the input pointer:
635  unsigned int* base_address_u32 =
636  reinterpret_cast<unsigned int*>(reinterpret_cast<size_t>(agg) & ~0x3);
637 
638  constexpr unsigned int byte_permutations[] = {0x3214, 0x3240, 0x3410, 0x4210};
639  unsigned int old_value = *base_address_u32;
640  unsigned int swap_value, compare_value;
641  do {
642  compare_value = old_value;
643  int8_t selected_old_val = static_cast<int8_t>(
644  __byte_perm(compare_value, 0, (reinterpret_cast<size_t>(agg) & 0x3) | 0x4440));
645  auto min_value = static_cast<unsigned int>(
646  selected_old_val == skip_val ? val : min(val, selected_old_val));
647  swap_value = __byte_perm(
648  compare_value, min_value, byte_permutations[reinterpret_cast<size_t>(agg) & 0x3]);
649  old_value = atomicCAS(base_address_u32, compare_value, swap_value);
650  } while (compare_value != old_value);
651 }

+ Here is the caller graph for this function:

__device__ double atomicMinDblSkipVal ( double *  address,
double  val,
const double  skip_val 
)

Definition at line 1070 of file cuda_mapd_rt.cu.

Referenced by agg_min_double_skip_val_shared().

1072  {
1073  unsigned long long int* address_as_ull =
1074  reinterpret_cast<unsigned long long int*>(address);
1075  unsigned long long int old = *address_as_ull;
1076  unsigned long long int skip_val_as_ull =
1077  *reinterpret_cast<const unsigned long long*>(&skip_val);
1078  unsigned long long int assumed;
1079 
1080  do {
1081  assumed = old;
1082  old = atomicCAS(address_as_ull,
1083  assumed,
1084  assumed == skip_val_as_ull
1085  ? *reinterpret_cast<unsigned long long*>(&val)
1086  : __double_as_longlong(min(val, __longlong_as_double(assumed))));
1087  } while (assumed != old);
1088 
1089  return __longlong_as_double(old);
1090 }

+ Here is the caller graph for this function:

__device__ float atomicMinFltSkipVal ( int32_t *  address,
float  val,
const float  skip_val 
)

Definition at line 1022 of file cuda_mapd_rt.cu.

References atomicMin().

Referenced by agg_min_float_skip_val_shared().

1022  {
1023  float old = atomicExch(reinterpret_cast<float*>(address), FLT_MAX);
1024  return atomicMin(
1025  reinterpret_cast<float*>(address),
1026  __float_as_int(old) == __float_as_int(skip_val) ? val : fminf(old, val));
1027 }
__device__ double atomicMin(double *address, double val)

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

__device__ int32_t atomicSum32SkipVal ( int32_t *  address,
const int32_t  val,
const int32_t  skip_val 
)

Definition at line 902 of file cuda_mapd_rt.cu.

Referenced by agg_sum_int32_skip_val_shared().

904  {
905  unsigned int* address_as_int = (unsigned int*)address;
906  int32_t old = atomicExch(address_as_int, 0);
907  int32_t old2 = atomicAdd(address_as_int, old == skip_val ? val : (val + old));
908  return old == skip_val ? old2 : (old2 + old);
909 }

+ Here is the caller graph for this function:

__device__ int64_t atomicSum64SkipVal ( int64_t *  address,
const int64_t  val,
const int64_t  skip_val 
)

Definition at line 921 of file cuda_mapd_rt.cu.

Referenced by agg_sum_skip_val_shared().

923  {
924  unsigned long long int* address_as_ull = (unsigned long long int*)address;
925  int64_t old = atomicExch(address_as_ull, 0);
926  int64_t old2 = atomicAdd(address_as_ull, old == skip_val ? val : (val + old));
927  return old == skip_val ? old2 : (old2 + old);
928 }

+ Here is the caller graph for this function:

__device__ void atomicSumDblSkipVal ( double *  address,
const double  val,
const double  skip_val 
)

Definition at line 1052 of file cuda_mapd_rt.cu.

Referenced by agg_sum_double_skip_val_shared().

1054  {
1055  unsigned long long int* address_as_ull = (unsigned long long int*)address;
1056  double old = __longlong_as_double(atomicExch(address_as_ull, __double_as_longlong(0.)));
1057  atomicAdd(
1058  address,
1059  __double_as_longlong(old) == __double_as_longlong(skip_val) ? val : (val + old));
1060 }

+ Here is the caller graph for this function:

__device__ void atomicSumFltSkipVal ( float *  address,
const float  val,
const float  skip_val 
)

Definition at line 1037 of file cuda_mapd_rt.cu.


Referenced by agg_sum_float_skip_val_shared().

1039  {
1040  float old = atomicExch(address, 0.f);
1041  atomicAdd(address, __float_as_int(old) == __float_as_int(skip_val) ? val : (val + old));
1042 }

+ Here is the caller graph for this function:

__device__ bool check_interrupt ( )

Definition at line 160 of file cuda_mapd_rt.cu.

Referenced by check_interrupt_rt(), and ColumnFetcher::linearizeFixedLenArrayColFrags().

160  {
161  return (runtime_interrupt_flag == 1) ? true : false;
162 }
__device__ int32_t runtime_interrupt_flag
Definition: cuda_mapd_rt.cu:96

+ Here is the caller graph for this function:

__device__ int32_t checked_single_agg_id_double_shared ( int64_t *  agg,
const double  val,
const double  null_val 
)

Definition at line 734 of file cuda_mapd_rt.cu.

736  {
737  unsigned long long int* address_as_ull = reinterpret_cast<unsigned long long int*>(agg);
738  unsigned long long int old = *address_as_ull, assumed;
739 
740  if (val == null_val) {
741  return 0;
742  }
743 
744  do {
745  if (static_cast<int64_t>(old) != __double_as_longlong(null_val)) {
746  if (static_cast<int64_t>(old) != __double_as_longlong(val)) {
747  // see Execute::ERR_SINGLE_VALUE_FOUND_MULTIPLE_VALUES
748  return 15;
749  } else {
750  break;
751  }
752  }
753 
754  assumed = old;
755  old = atomicCAS(address_as_ull, assumed, __double_as_longlong(val));
756  } while (assumed != old);
757 
758  return 0;
759 }
__device__ int32_t checked_single_agg_id_double_shared_slow ( int64_t *  agg,
const double *  valp,
const double  null_val 
)

Definition at line 766 of file cuda_mapd_rt.cu.

768  {
769  unsigned long long int* address_as_ull = reinterpret_cast<unsigned long long int*>(agg);
770  unsigned long long int old = *address_as_ull, assumed;
771  double val = *valp;
772 
773  if (val == null_val) {
774  return 0;
775  }
776 
777  do {
778  if (static_cast<int64_t>(old) != __double_as_longlong(null_val)) {
779  if (static_cast<int64_t>(old) != __double_as_longlong(val)) {
780  // see Execute::ERR_SINGLE_VALUE_FOUND_MULTIPLE_VALUES
781  return 15;
782  } else {
783  break;
784  }
785  }
786 
787  assumed = old;
788  old = atomicCAS(address_as_ull, assumed, __double_as_longlong(val));
789  } while (assumed != old);
790 
791  return 0;
792 }
__device__ int32_t checked_single_agg_id_float_shared ( int32_t *  agg,
const float  val,
const float  null_val 
)

Definition at line 798 of file cuda_mapd_rt.cu.

800  {
801  int* address_as_ull = reinterpret_cast<int*>(agg);
802  int old = *address_as_ull, assumed;
803 
804  if (val == null_val) {
805  return 0;
806  }
807 
808  do {
809  if (old != __float_as_int(null_val)) {
810  if (old != __float_as_int(val)) {
811  // see Execute::ERR_SINGLE_VALUE_FOUND_MULTIPLE_VALUES
812  return 15;
813  } else {
814  break;
815  }
816  }
817 
818  assumed = old;
819  old = atomicCAS(address_as_ull, assumed, __float_as_int(val));
820  } while (assumed != old);
821 
822  return 0;
823 }
__device__ int32_t checked_single_agg_id_shared ( int64_t *  agg,
const int64_t  val,
const int64_t  null_val 
)

Definition at line 691 of file cuda_mapd_rt.cu.

693  {
694  unsigned long long int* address_as_ull = reinterpret_cast<unsigned long long int*>(agg);
695  unsigned long long int old = *address_as_ull, assumed;
696 
697  if (val == null_val) {
698  return 0;
699  }
700 
701  do {
702  if (static_cast<int64_t>(old) != null_val) {
703  if (static_cast<int64_t>(old) != val) {
704  // see Execute::ERR_SINGLE_VALUE_FOUND_MULTIPLE_VALUES
705  return 15;
706  } else {
707  break;
708  }
709  }
710 
711  assumed = old;
712  old = atomicCAS(address_as_ull, assumed, val);
713  } while (assumed != old);
714 
715  return 0;
716 }
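
checked_single_agg_id_shared implements SINGLE_VALUE semantics: the first non-null value is installed via atomicCAS, a matching value is a no-op, and a conflicting non-null value yields error code 15 (Execute::ERR_SINGLE_VALUE_FOUND_MULTIPLE_VALUES, per the comment in the code). A hypothetical caller sketch showing how that error code could be propagated:

// single_value_row_sketch is not part of the runtime; it only illustrates
// recording the first error seen in a shared error slot.
__device__ void single_value_row_sketch(int64_t* slot,
                                        const int64_t val,
                                        const int64_t null_val,
                                        int32_t* error_code) {
  const int32_t err = checked_single_agg_id_shared(slot, val, null_val);
  if (err) {
    atomicCAS(error_code, 0, err);  // keep only the first error code
  }
}
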
__device__ int64_t* declare_dynamic_shared_memory ( )

Definition at line 57 of file cuda_mapd_rt.cu.

57  {
58  extern __shared__ int64_t shared_mem_buffer[];
59  return shared_mem_buffer;
60 }
__device__ bool dynamic_watchdog ( )

Definition at line 116 of file cuda_mapd_rt.cu.

Referenced by anonymous_namespace{ResultSetReduction.cpp}::check_watchdog(), check_watchdog_rt(), anonymous_namespace{ResultSetReduction.cpp}::check_watchdog_with_seed(), get_group_value_columnar_slot_with_watchdog(), get_group_value_columnar_with_watchdog(), and get_group_value_with_watchdog().

116  {
117  // check the dynamic watchdog; if it has triggered, all threads return true
118  if (dw_cycle_budget == 0LL) {
119  return false; // Uninitialized watchdog can't check time
120  }
121  if (dw_abort == 1) {
122  return true; // Received host request to abort
123  }
124  uint32_t smid = get_smid();
125  if (smid >= 128) {
126  return false;
127  }
128  __shared__ volatile int64_t dw_block_cycle_start; // Thread block shared cycle start
129  __shared__ volatile bool
130  dw_should_terminate; // all threads within a block should return together if
131  // watchdog criteria is met
132 
133  // thread 0 either initializes or reads the initial clock cycle; the result is stored
134  // into shared memory. Since all threads within a block share the same SM, there's no
135  // point in using more threads here.
136  if (threadIdx.x == 0) {
137  dw_block_cycle_start = 0LL;
138  int64_t cycle_count = static_cast<int64_t>(clock64());
139  // Make sure the block hasn't switched SMs
140  if (smid == get_smid()) {
141  dw_block_cycle_start = static_cast<int64_t>(
142  atomicCAS(reinterpret_cast<unsigned long long*>(&dw_sm_cycle_start[smid]),
143  0ULL,
144  static_cast<unsigned long long>(cycle_count)));
145  }
146 
147  int64_t cycles = cycle_count - dw_block_cycle_start;
148  if ((smid == get_smid()) && (dw_block_cycle_start > 0LL) &&
149  (cycles > dw_cycle_budget)) {
150  // Check if we're out of time on this particular SM
151  dw_should_terminate = true;
152  } else {
153  dw_should_terminate = false;
154  }
155  }
156  __syncthreads();
157  return dw_should_terminate;
158 }
__device__ int64_t dw_sm_cycle_start[128]
Definition: cuda_mapd_rt.cu:92
__device__ int64_t dw_cycle_budget
Definition: cuda_mapd_rt.cu:94
__inline__ __device__ uint32_t get_smid(void)
Definition: cuda_mapd_rt.cu:98
__device__ int32_t dw_abort
Definition: cuda_mapd_rt.cu:95

+ Here is the caller graph for this function:
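
Query kernels are expected to poll dynamic_watchdog() and exit cooperatively once the per-SM cycle budget is exhausted or dw_abort is set. A hypothetical polling loop (sketch only; the real query kernels are generated by the JIT, and the trip count is padded so every thread of a block reaches the internal __syncthreads() together):

__global__ void watched_kernel_sketch(const int64_t* in, int64_t* out, int32_t n) {
  const int32_t step = blockDim.x * gridDim.x;
  // Round the trip count so every thread in the block executes the same number
  // of iterations; dynamic_watchdog() uses __syncthreads() internally, so all
  // threads of a block must call it together.
  const int32_t padded_n = ((n + step - 1) / step) * step;
  for (int32_t i = blockIdx.x * blockDim.x + threadIdx.x; i < padded_n; i += step) {
    if (dynamic_watchdog()) {
      return;  // budget exhausted or dw_abort set by the host; uniform per block
    }
    if (i < n) {
      out[i] = in[i];  // placeholder for per-row work
    }
  }
}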

__device__ void force_sync ( )

Definition at line 1286 of file cuda_mapd_rt.cu.

1286  {
1287  __threadfence_block();
1288 }
__device__ int64_t get_block_index ( )

Definition at line 24 of file cuda_mapd_rt.cu.

24  {
25  return blockIdx.x;
26 }
template<typename T = unsigned long long>
__device__ T get_empty_key ( )
inline

Definition at line 165 of file cuda_mapd_rt.cu.

References EMPTY_KEY_64.

165  {
166  return EMPTY_KEY_64;
167 }
#define EMPTY_KEY_64
template<>
__device__ unsigned int get_empty_key ( )
inline

Definition at line 170 of file cuda_mapd_rt.cu.

References EMPTY_KEY_32.

170  {
171  return EMPTY_KEY_32;
172 }
#define EMPTY_KEY_32
template<typename T >
__device__ int64_t* get_matching_group_value ( int64_t *  groups_buffer,
const uint32_t  h,
const T *  key,
const uint32_t  key_count,
const uint32_t  row_size_quad 
)
inline

Definition at line 175 of file cuda_mapd_rt.cu.

References align_to_int64().

Referenced by get_group_value(), get_group_value_with_watchdog(), and get_matching_group_value().

179  {
180  const T empty_key = get_empty_key<T>();
181  uint32_t off = h * row_size_quad;
182  auto row_ptr = reinterpret_cast<T*>(groups_buffer + off);
183  {
184  const T old = atomicCAS(row_ptr, empty_key, *key);
185  if (empty_key == old && key_count > 1) {
186  for (size_t i = 1; i <= key_count - 1; ++i) {
187  atomicExch(row_ptr + i, key[i]);
188  }
189  }
190  }
191  if (key_count > 1) {
192  while (atomicAdd(row_ptr + key_count - 1, 0) == empty_key) {
193  // spin until the winning thread has finished writing the entire key and the init
194  // value
195  }
196  }
197  bool match = true;
198  for (uint32_t i = 0; i < key_count; ++i) {
199  if (row_ptr[i] != key[i]) {
200  match = false;
201  break;
202  }
203  }
204 
205  if (match) {
206  auto row_ptr_i8 = reinterpret_cast<int8_t*>(row_ptr + key_count);
207  return reinterpret_cast<int64_t*>(align_to_int64(row_ptr_i8));
208  }
209  return NULL;
210 }
FORCE_INLINE HOST DEVICE T align_to_int64(T addr)

+ Here is the call graph for this function:

+ Here is the caller graph for this function:
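
The template claims the first key element of row h with atomicCAS against the empty key, publishes the remaining elements, spins until the winning thread has finished writing, and returns a pointer just past the key (64-bit aligned) where the aggregate columns start, or NULL if the slot holds a different key. A hypothetical caller sketch (probe_group_slot_sketch and the linear-probing policy are assumptions; the real probing loops live in GroupByRuntime.cpp):

// Probe successive slots until a row is claimed or matched; a NULL return
// means the slot already holds a different key.
__device__ int64_t* probe_group_slot_sketch(int64_t* groups_buffer,
                                            const uint32_t groups_buffer_entry_count,
                                            const int64_t* key,
                                            const uint32_t key_count,
                                            const uint32_t row_size_quad,
                                            uint32_t h /* initial hash slot */) {
  uint32_t slot = h;
  do {
    int64_t* value_columns = get_matching_group_value(
        groups_buffer, slot, key, key_count, /*key_width=*/8, row_size_quad);
    if (value_columns) {
      return value_columns;  // points just past the key, 64-bit aligned
    }
    slot = (slot + 1) % groups_buffer_entry_count;  // linear probing (assumption)
  } while (slot != h);
  return NULL;  // table full
}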

__device__ int64_t* get_matching_group_value ( int64_t *  groups_buffer,
const uint32_t  h,
const int64_t *  key,
const uint32_t  key_count,
const uint32_t  key_width,
const uint32_t  row_size_quad 
)

Definition at line 212 of file cuda_mapd_rt.cu.

References get_matching_group_value().

217  {
218  switch (key_width) {
219  case 4:
220  return get_matching_group_value(groups_buffer,
221  h,
222  reinterpret_cast<const unsigned int*>(key),
223  key_count,
224  row_size_quad);
225  case 8:
226  return get_matching_group_value(groups_buffer,
227  h,
228  reinterpret_cast<const unsigned long long*>(key),
229  key_count,
230  row_size_quad);
231  default:
232  return NULL;
233  }
234 }
__device__ int64_t * get_matching_group_value(int64_t *groups_buffer, const uint32_t h, const T *key, const uint32_t key_count, const uint32_t row_size_quad)

+ Here is the call graph for this function:

__device__ int64_t* get_matching_group_value_columnar ( int64_t *  groups_buffer,
const uint32_t  h,
const int64_t *  key,
const uint32_t  key_qw_count,
const size_t  entry_count 
)

Definition at line 297 of file cuda_mapd_rt.cu.

References EMPTY_KEY_64.

Referenced by get_group_value_columnar(), and get_group_value_columnar_with_watchdog().

302  {
303  uint32_t off = h;
304  {
305  const uint64_t old = atomicCAS(
306  reinterpret_cast<unsigned long long*>(groups_buffer + off), EMPTY_KEY_64, *key);
307  if (EMPTY_KEY_64 == old) {
308  for (size_t i = 0; i < key_qw_count; ++i) {
309  groups_buffer[off] = key[i];
310  off += entry_count;
311  }
312  return &groups_buffer[off];
313  }
314  }
315  __syncthreads();
316  off = h;
317  for (size_t i = 0; i < key_qw_count; ++i) {
318  if (groups_buffer[off] != key[i]) {
319  return NULL;
320  }
321  off += entry_count;
322  }
323  return &groups_buffer[off];
324 }
#define EMPTY_KEY_64

+ Here is the caller graph for this function:

template<typename T >
__device__ int32_t get_matching_group_value_columnar_slot ( int64_t *  groups_buffer,
const uint32_t  entry_count,
const uint32_t  h,
const T *  key,
const uint32_t  key_count 
)

Definition at line 237 of file cuda_mapd_rt.cu.


Referenced by get_group_value_columnar_slot(), get_group_value_columnar_slot_with_watchdog(), and get_matching_group_value_columnar_slot().

241  {
242  const T empty_key = get_empty_key<T>();
243  const uint64_t old =
244  atomicCAS(reinterpret_cast<T*>(groups_buffer + h), empty_key, *key);
245  // the winning thread proceeds with writing the rest of the keys
246  if (old == empty_key) {
247  uint32_t offset = h + entry_count;
248  for (size_t i = 1; i < key_count; ++i) {
249  *reinterpret_cast<T*>(groups_buffer + offset) = key[i];
250  offset += entry_count;
251  }
252  }
253 
254  __threadfence();
255  // for all threads except the winning thread, the memory contents of the keys
256  // at the hash offset are checked again. In case of a complete match,
257  // the hash offset is returned; otherwise -1 is returned
258  if (old != empty_key) {
259  uint32_t offset = h;
260  for (uint32_t i = 0; i < key_count; ++i) {
261  if (*reinterpret_cast<T*>(groups_buffer + offset) != key[i]) {
262  return -1;
263  }
264  offset += entry_count;
265  }
266  }
267  return h;
268 }

+ Here is the caller graph for this function:
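
In the columnar layout assumed here, component i of the key for entry slot h is stored at groups_buffer[h + i * entry_count], i.e. key components are laid out as separate columns of entry_count 64-bit words. A host-side sketch of that address arithmetic (columnar_key_component is a hypothetical helper):

#include <cstdint>
#include <cstddef>

// Address of key component component_idx for entry slot h in the columnar
// group-by buffer.
int64_t* columnar_key_component(int64_t* groups_buffer,
                                uint32_t entry_count,
                                uint32_t h,
                                uint32_t component_idx) {
  return groups_buffer + h + static_cast<size_t>(component_idx) * entry_count;
}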

__device__ int32_t get_matching_group_value_columnar_slot ( int64_t *  groups_buffer,
const uint32_t  entry_count,
const uint32_t  h,
const int64_t *  key,
const uint32_t  key_count,
const uint32_t  key_width 
)

Definition at line 271 of file cuda_mapd_rt.cu.

References get_matching_group_value_columnar_slot().

276  {
277  switch (key_width) {
278  case 4:
279  return get_matching_group_value_columnar_slot(
280  groups_buffer,
281  entry_count,
282  h,
283  reinterpret_cast<const unsigned int*>(key),
284  key_count);
285  case 8:
286  return get_matching_group_value_columnar_slot(
287  groups_buffer,
288  entry_count,
289  h,
290  reinterpret_cast<const unsigned long long*>(key),
291  key_count);
292  default:
293  return -1;
294  }
295 }
__device__ int32_t get_matching_group_value_columnar_slot(int64_t *groups_buffer, const uint32_t entry_count, const uint32_t h, const T *key, const uint32_t key_count)

+ Here is the call graph for this function:

__inline__ __device__ uint32_t get_smid ( void  )

Definition at line 98 of file cuda_mapd_rt.cu.

Referenced by dynamic_watchdog().

98  {
99  uint32_t ret;
100  asm("mov.u32 %0, %%smid;" : "=r"(ret));
101  return ret;
102 }

+ Here is the caller graph for this function:
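get_smid() reads the %smid special register with inline PTX; dynamic_watchdog() uses it to index the per-SM dw_sm_cycle_start table. The hypothetical kernel below (not part of the runtime) shows how the same register can be sampled per block:

__global__ void record_sm_ids(uint32_t* sm_id_per_block) {
  if (threadIdx.x == 0) {
    uint32_t smid;
    asm("mov.u32 %0, %%smid;" : "=r"(smid));  // same PTX as get_smid()
    sm_id_per_block[blockIdx.x] = smid;       // which SM executed this block
  }
}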

__device__ int64_t get_thread_index ( )

Definition at line 20 of file cuda_mapd_rt.cu.

20  {
21  return threadIdx.x;
22 }
__device__ int32_t group_buff_idx_impl ( )

Definition at line 32 of file cuda_mapd_rt.cu.

References pos_start_impl().

32  {
33  return pos_start_impl(NULL);
34 }
__device__ int32_t pos_start_impl(const int32_t *row_index_resume)
Definition: cuda_mapd_rt.cu:28

+ Here is the call graph for this function:

__device__ const int64_t* init_shared_mem ( const int64_t *  global_groups_buffer,
const int32_t  groups_buffer_size 
)

Initializes the shared memory buffer for perfect hash group by. In this function, we simply copy the global group by buffer (already initialized on the host and transferred) to all shared memory group by buffers.

Definition at line 67 of file cuda_mapd_rt.cu.

68  {
69  // dynamic shared memory declaration
70  extern __shared__ int64_t shared_groups_buffer[];
71 
72  // it is assumed that the buffer size is aligned to 64-bit units,
73  // so it is safe to assign one 64-bit element to each thread
74  const int32_t buffer_units = groups_buffer_size >> 3;
75 
76  for (int32_t pos = threadIdx.x; pos < buffer_units; pos += blockDim.x) {
77  shared_groups_buffer[pos] = global_groups_buffer[pos];
78  }
79  __syncthreads();
80  return shared_groups_buffer;
81 }
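Because init_shared_mem() declares the buffer as extern __shared__, the launch site must pass the buffer size in bytes as the dynamic shared memory launch parameter. The sketch below is an assumption for illustration only; the real query kernels are generated at runtime.

// Hypothetical kernel that consumes the shared-memory copy.
__global__ void group_by_kernel(const int64_t* global_groups_buffer,
                                const int32_t groups_buffer_size) {
  const int64_t* groups_buffer =
      init_shared_mem(global_groups_buffer, groups_buffer_size);
  (void)groups_buffer;  // ... aggregate into the shared-memory buffer here ...
}

// Host-side launch: groups_buffer_size is in bytes (64-bit aligned, matching the
// `>> 3` inside init_shared_mem) and doubles as the dynamic shared memory size.
void launch_group_by(const int64_t* dev_groups_buffer,
                     const int32_t groups_buffer_size,
                     const int grid_dim,
                     const int block_dim) {
  group_by_kernel<<<grid_dim, block_dim, groups_buffer_size>>>(dev_groups_buffer,
                                                               groups_buffer_size);
}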
__device__ const int64_t* init_shared_mem_nop ( const int64_t *  groups_buffer,
const int32_t  groups_buffer_size 
)

Definition at line 44 of file cuda_mapd_rt.cu.

46  {
47  return groups_buffer;
48 }
__device__ void linear_probabilistic_count ( uint8_t *  bitmap,
const uint32_t  bitmap_bytes,
const uint8_t *  key_bytes,
const uint32_t  key_len 
)

Definition at line 1215 of file cuda_mapd_rt.cu.

1218  {
1219  const uint32_t bit_pos = MurmurHash3(key_bytes, key_len, 0) % (bitmap_bytes * 8);
1220  const uint32_t word_idx = bit_pos / 32;
1221  const uint32_t bit_idx = bit_pos % 32;
1222  atomicOr(((uint32_t*)bitmap) + word_idx, 1 << bit_idx);
1223 }
RUNTIME_EXPORT NEVER_INLINE DEVICE uint32_t MurmurHash3(const void *key, int len, const uint32_t seed)
Definition: MurmurHash.cpp:33
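linear_probabilistic_count() only sets bits; turning the bitmap into an approximate distinct count happens elsewhere. As an illustration only (not the runtime's estimator), the standard linear-counting formula -m * ln(z / m), with m bits total and z bits still zero, can be applied on the host once the bitmap is copied back:

#include <cmath>
#include <cstdint>

// Illustrative host-side estimator, not the runtime's own code.
double linear_counting_estimate(const uint8_t* bitmap, const uint32_t bitmap_bytes) {
  const double m = static_cast<double>(bitmap_bytes) * 8.0;  // total bits
  uint64_t zero_bits = 0;
  for (uint32_t i = 0; i < bitmap_bytes; ++i) {
    for (int b = 0; b < 8; ++b) {
      zero_bits += ((bitmap[i] >> b) & 1) ? 0 : 1;
    }
  }
  if (zero_bits == 0) {
    return m;  // bitmap saturated; the estimate is no longer reliable
  }
  return -m * std::log(static_cast<double>(zero_bits) / m);
}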
__device__ int32_t pos_start_impl ( const int32_t *  row_index_resume)

Definition at line 28 of file cuda_mapd_rt.cu.

Referenced by get_bin_from_k_heap_impl(), get_error_code(), group_buff_idx_impl(), and record_error_code().

28  {
29  return blockIdx.x * blockDim.x + threadIdx.x;
30 }

+ Here is the caller graph for this function:

__device__ int32_t pos_step_impl ( )

Definition at line 36 of file cuda_mapd_rt.cu.

Referenced by get_bin_from_k_heap_impl().

36  {
37  return blockDim.x * gridDim.x;
38 }

+ Here is the caller graph for this function:
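Together, pos_start_impl() and pos_step_impl() provide the global thread index and the grid stride used for grid-stride loops over the input rows. A minimal stand-alone sketch (the kernel itself is illustrative, not part of the runtime):

__global__ void scan_rows(const int64_t* in, int64_t* out, const int32_t row_count) {
  // Equivalent to pos_start_impl(NULL) and pos_step_impl():
  const int32_t start = blockIdx.x * blockDim.x + threadIdx.x;
  const int32_t step = blockDim.x * gridDim.x;
  for (int32_t pos = start; pos < row_count; pos += step) {
    out[pos] = in[pos];  // placeholder per-row work
  }
}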

__device__ bool slotEmptyKeyCAS ( int64_t *  slot,
int64_t  new_val,
int64_t  init_val 
)

Definition at line 1115 of file cuda_mapd_rt.cu.

1117  {
1118  auto slot_address = reinterpret_cast<unsigned long long int*>(slot);
1119  const auto empty_key =
1120  static_cast<unsigned long long int*>(static_cast<void*>(&init_val));
1121  const auto new_val_cast =
1122  static_cast<unsigned long long int*>(static_cast<void*>(&new_val));
1123 
1124  const auto old_val = atomicCAS(slot_address, *empty_key, *new_val_cast);
1125  if (old_val == *empty_key) {
1126  return true;
1127  } else {
1128  return false;
1129  }
1130 }
__device__ bool slotEmptyKeyCAS_int16 ( int16_t *  slot,
int16_t  new_val,
int16_t  init_val 
)

Definition at line 1143 of file cuda_mapd_rt.cu.

1145  {
1146  unsigned int* base_slot_address =
1147  reinterpret_cast<unsigned int*>(reinterpret_cast<size_t>(slot) & ~0x3);
1148  unsigned int old_value = *base_slot_address;
1149  unsigned int swap_value, compare_value;
1150  do {
1151  compare_value = old_value;
1152  // exit criteria: if init_val does not exist in the slot (some other thread has
1153  // succeeded)
1154  if (static_cast<unsigned int>(init_val) !=
1155  __byte_perm(
1156  compare_value, 0, (reinterpret_cast<size_t>(slot) & 0x2 ? 0x3244 : 0x4410))) {
1157  return false;
1158  }
1159  swap_value = __byte_perm(compare_value,
1160  static_cast<unsigned int>(new_val),
1161  (reinterpret_cast<size_t>(slot) & 0x2) ? 0x5410 : 0x3254);
1162  old_value = atomicCAS(base_slot_address, compare_value, swap_value);
1163  } while (compare_value != old_value);
1164  return true;
1165 }
__device__ bool slotEmptyKeyCAS_int32 ( int32_t *  slot,
int32_t  new_val,
int32_t  init_val 
)

Definition at line 1132 of file cuda_mapd_rt.cu.

1134  {
1135  unsigned int* slot_address = reinterpret_cast<unsigned int*>(slot);
1136  unsigned int compare_value = static_cast<unsigned int>(init_val);
1137  unsigned int swap_value = static_cast<unsigned int>(new_val);
1138 
1139  const unsigned int old_value = atomicCAS(slot_address, compare_value, swap_value);
1140  return old_value == compare_value;
1141 }
__device__ bool slotEmptyKeyCAS_int8 ( int8_t *  slot,
int8_t  new_val,
int8_t  init_val 
)

Definition at line 1167 of file cuda_mapd_rt.cu.

1169  {
1170  // properly align the slot address:
1171  unsigned int* base_slot_address =
1172  reinterpret_cast<unsigned int*>(reinterpret_cast<size_t>(slot) & ~0x3);
1173  constexpr unsigned int byte_permutations[] = {0x3214, 0x3240, 0x3410, 0x4210};
1174  unsigned int old_value = *base_slot_address;
1175  unsigned int swap_value, compare_value;
1176  do {
1177  compare_value = old_value;
1178  // exit criteria: if init_val does not exist in the slot (some other thread has
1179  // succeeded)
1180  if (static_cast<unsigned int>(init_val) !=
1181  __byte_perm(compare_value, 0, (reinterpret_cast<size_t>(slot) & 0x3) | 0x4440)) {
1182  return false;
1183  }
1184  swap_value = __byte_perm(compare_value,
1185  static_cast<unsigned int>(new_val),
1186  byte_permutations[reinterpret_cast<size_t>(slot) & 0x3]);
1187  old_value = atomicCAS(base_slot_address, compare_value, swap_value);
1188  } while (compare_value != old_value);
1189  return true;
1190 }
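The 8-bit and 16-bit variants emulate sub-word CAS by operating on the enclosing 32-bit word with __byte_perm selectors. The helpers below are illustration only (not part of the runtime) and show what the selector table used above does for a single byte:

// Result byte 0 = byte `byte_idx` of `word`; the remaining result bytes come from
// the zero operand (selector nibble 4 picks byte 0 of the second source, which is 0).
__device__ unsigned int extract_byte(unsigned int word, unsigned int byte_idx /*0..3*/) {
  return __byte_perm(word, 0u, byte_idx | 0x4440);
}

// Same selector table as slotEmptyKeyCAS_int8: keep the other three bytes of `word`
// and splice byte 0 of `new_byte` into position `byte_idx`.
__device__ unsigned int replace_byte(unsigned int word,
                                     unsigned int new_byte,
                                     unsigned int byte_idx /*0..3*/) {
  constexpr unsigned int selectors[] = {0x3214, 0x3240, 0x3410, 0x4210};
  return __byte_perm(word, new_byte, selectors[byte_idx]);
}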
__device__ uint64_t string_decode ( int8_t *  chunk_iter_,
int64_t  pos 
)

Definition at line 1204 of file cuda_mapd_rt.cu.

References ChunkIter_get_nth(), VarlenDatum::is_null, VarlenDatum::length, and VarlenDatum::pointer.

1204  {
1205  // TODO(alex): de-dup, the x64 version is basically identical
1206  ChunkIter* chunk_iter = reinterpret_cast<ChunkIter*>(chunk_iter_);
1207  VarlenDatum vd;
1208  bool is_end;
1209  ChunkIter_get_nth(chunk_iter, pos, false, &vd, &is_end);
1210  return vd.is_null ? 0
1211  : (reinterpret_cast<uint64_t>(vd.pointer) & 0xffffffffffff) |
1212  (static_cast<uint64_t>(vd.length) << 48);
1213 }
bool is_null
Definition: sqltypes.h:153
DEVICE void ChunkIter_get_nth(ChunkIter *it, int n, bool uncompress, VarlenDatum *result, bool *is_end)
Definition: ChunkIter.cpp:181
int8_t * pointer
Definition: sqltypes.h:152
size_t length
Definition: sqltypes.h:151

+ Here is the call graph for this function:
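The packed value returned by string_decode() carries the string pointer in the low 48 bits and the length in the high 16 bits. A hypothetical consumer (not the runtime's own code) would unpack it like this:

__device__ void unpack_string(const uint64_t packed, const char** ptr, uint32_t* len) {
  *ptr = reinterpret_cast<const char*>(packed & 0xffffffffffffULL);  // low 48 bits: pointer
  *len = static_cast<uint32_t>(packed >> 48);                        // high 16 bits: length
}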

__device__ void sync_threadblock ( )

Definition at line 1309 of file cuda_mapd_rt.cu.

Referenced by GpuSharedMemCodeBuilder::codegenInitialization(), and GpuSharedMemCodeBuilder::codegenReduction().

1309  {
1310  __syncthreads();
1311 }

+ Here is the caller graph for this function:

__device__ void sync_warp ( )

Definition at line 1290 of file cuda_mapd_rt.cu.

1290  {
1291  __syncwarp();
1292 }
__device__ void sync_warp_protected ( int64_t  thread_pos,
int64_t  row_count 
)

Protected warp synchronization to make sure all (or none) of the threads within a warp go through a synchronization barrier. thread_pos: the current thread position to be used for a memory access. row_count: maximum number of rows to be processed. The function performs a warp sync iff all 32 threads within that warp will process valid data. NOTE: it currently assumes that the warp size is 32.

Definition at line 1301 of file cuda_mapd_rt.cu.

1301  {
1302  // only syncing if NOT within the same warp as those threads experiencing the critical
1303  // edge
1304  if ((((row_count - 1) | 0x1F) - thread_pos) >= 32) {
1305  __syncwarp();
1306  }
1307 }
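A worked example of the guard, assuming the warp size of 32 noted above (the numbers are illustrative):

// row_count = 100, so the last valid row index is 99 and (99 | 0x1F) == 127,
// the last lane of the warp that holds the final row.
//   thread_pos  0..95 : 127 - thread_pos >= 32  -> the whole warp processes valid
//                                                  rows, so it is safe to __syncwarp()
//   thread_pos 96..127: 127 - thread_pos <  32  -> partial last warp; some lanes have
//                                                  no row, so the sync is skipped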
__device__ int8_t thread_warp_idx ( const int8_t  warp_sz)

Definition at line 40 of file cuda_mapd_rt.cu.

40  {
41  return threadIdx.x % warp_sz;
42 }
__device__ void write_back_non_grouped_agg ( int64_t *  input_buffer,
int64_t *  output_buffer,
const int32_t  agg_idx 
)

Definition at line 1321 of file cuda_mapd_rt.cu.

References agg_sum_shared().

1323  {
1324  if (threadIdx.x == agg_idx) {
1325  agg_sum_shared(output_buffer, input_buffer[agg_idx]);
1326  }
1327 }
__device__ int64_t agg_sum_shared(int64_t *agg, const int64_t val)

+ Here is the call graph for this function:

__device__ void write_back_nop ( int64_t *  dest,
int64_t *  src,
const int32_t  sz 
)

Definition at line 50 of file cuda_mapd_rt.cu.

50  {
51 }

Variable Documentation

__device__ int32_t dw_abort = 0

Definition at line 95 of file cuda_mapd_rt.cu.

Referenced by dynamic_watchdog(), dynamic_watchdog_init(), and Executor::interrupt().

__device__ int64_t dw_cycle_budget = 0

Definition at line 94 of file cuda_mapd_rt.cu.

Referenced by dynamic_watchdog(), and dynamic_watchdog_init().

__device__ int64_t dw_sm_cycle_start[128]

Definition at line 92 of file cuda_mapd_rt.cu.

Referenced by dynamic_watchdog().

__device__ int32_t runtime_interrupt_flag = 0

Definition at line 96 of file cuda_mapd_rt.cu.

Referenced by check_interrupt(), check_interrupt_init(), and Executor::interrupt().
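Executor::interrupt() itself is not shown on this page. In general, a host thread can raise such __device__ flags with cudaMemcpyToSymbol, assuming the symbol is visible in the translation unit; the snippet below is a hypothetical illustration, not the Executor's actual code.

#include <cuda_runtime.h>

// Hypothetical host-side helper: set runtime_interrupt_flag to 1 so that
// check_interrupt() returns true on the device.
cudaError_t request_runtime_interrupt() {
  const int32_t one = 1;
  return cudaMemcpyToSymbol(runtime_interrupt_flag, &one, sizeof(one));
}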