OmniSciDB  addbbd5075
cuda_mapd_rt.cu File Reference
#include <cuda.h>
#include <float.h>
#include <stdint.h>
#include <limits>
#include "BufferCompaction.h"
#include "ExtensionFunctions.hpp"
#include "GpuRtConstants.h"
#include "HyperLogLogRank.h"
#include "TableFunctions/TableFunctions.hpp"
#include "GpuInitGroups.cu"
#include "GroupByRuntime.cpp"
#include "JoinHashTableQueryRuntime.cpp"
#include "MurmurHash.cpp"
#include "TopKRuntime.cpp"
#include <stdio.h>
#include "../Utils/ChunkIter.cpp"
#include "DateTruncate.cpp"
#include "ExtractFromTime.cpp"
#include "ArrayOps.cpp"
#include "DateAdd.cpp"
#include "StringFunctions.cpp"
#include "../Utils/Regexp.cpp"
#include "../Utils/StringLike.cpp"

Macros

#define init_group_by_buffer_gpu_impl   init_group_by_buffer_gpu
 
#define DEF_AGG_ID_INT_SHARED(n)
 
#define DEF_SKIP_AGG(base_agg_func)
 
#define DATA_T   int64_t
 
#define ADDR_T   uint64_t
 
#define DATA_T   int32_t
 
#define ADDR_T   uint32_t
 
#define DEF_SKIP_AGG(base_agg_func)
 
#define DATA_T   double
 
#define ADDR_T   uint64_t
 
#define DATA_T   float
 
#define ADDR_T   uint32_t
 
#define EXECUTE_INCLUDE
 

Functions

__device__ int32_t pos_start_impl (const int32_t *row_index_resume)
 
__device__ int32_t group_buff_idx_impl ()
 
__device__ int32_t pos_step_impl ()
 
__device__ int8_t thread_warp_idx (const int8_t warp_sz)
 
__device__ const int64_t * init_shared_mem_nop (const int64_t *groups_buffer, const int32_t groups_buffer_size)
 
__device__ void write_back_nop (int64_t *dest, int64_t *src, const int32_t sz)
 
__device__ const int64_t * init_shared_mem (const int64_t *groups_buffer, const int32_t groups_buffer_size)
 
__device__ int64_t * alloc_shared_mem_dynamic ()
 
__device__ void set_shared_mem_to_identity (int64_t *groups_buffer_smem, const int32_t groups_buffer_size, const int64_t identity_element=0)
 
__device__ const int64_t * init_shared_mem_dynamic (const int64_t *groups_buffer, const int32_t groups_buffer_size)
 
__device__ void write_back (int64_t *dest, int64_t *src, const int32_t sz)
 
__device__ void write_back_smem_nop (int64_t *dest, int64_t *src, const int32_t sz)
 
__device__ void agg_from_smem_to_gmem_nop (int64_t *gmem_dest, int64_t *smem_src, const int32_t num_elements)
 
__device__ void agg_from_smem_to_gmem_binId_count (int64_t *gmem_dest, int64_t *smem_src, const int32_t num_elements)
 
__device__ void agg_from_smem_to_gmem_count_binId (int64_t *gmem_dest, int64_t *smem_src, const int32_t num_elements)
 
__inline__ __device__ uint32_t get_smid (void)
 
__device__ bool dynamic_watchdog ()
 
template<typename T = unsigned long long>
__device__ T get_empty_key ()
 
template<>
__device__ unsigned int get_empty_key ()
 
template<typename T >
__device__ int64_t * get_matching_group_value (int64_t *groups_buffer, const uint32_t h, const T *key, const uint32_t key_count, const uint32_t row_size_quad)
 
__device__ int64_t * get_matching_group_value (int64_t *groups_buffer, const uint32_t h, const int64_t *key, const uint32_t key_count, const uint32_t key_width, const uint32_t row_size_quad, const int64_t *init_vals)
 
template<typename T >
__device__ int32_t get_matching_group_value_columnar_slot (int64_t *groups_buffer, const uint32_t entry_count, const uint32_t h, const T *key, const uint32_t key_count)
 
__device__ int32_t get_matching_group_value_columnar_slot (int64_t *groups_buffer, const uint32_t entry_count, const uint32_t h, const int64_t *key, const uint32_t key_count, const uint32_t key_width)
 
__device__ int64_t * get_matching_group_value_columnar (int64_t *groups_buffer, const uint32_t h, const int64_t *key, const uint32_t key_qw_count, const size_t entry_count)
 
__device__ int64_t atomicMax64 (int64_t *address, int64_t val)
 
__device__ int64_t atomicMin64 (int64_t *address, int64_t val)
 
__device__ double atomicAdd (double *address, double val)
 
__device__ double atomicMax (double *address, double val)
 
__device__ float atomicMax (float *address, float val)
 
__device__ double atomicMin (double *address, double val)
 
__device__ double atomicMin (float *address, float val)
 
__device__ uint64_t agg_count_shared (uint64_t *agg, const int64_t val)
 
__device__ uint32_t agg_count_int32_shared (uint32_t *agg, const int32_t val)
 
__device__ uint64_t agg_count_double_shared (uint64_t *agg, const double val)
 
__device__ uint32_t agg_count_float_shared (uint32_t *agg, const float val)
 
__device__ int64_t agg_sum_shared (int64_t *agg, const int64_t val)
 
__device__ int32_t agg_sum_int32_shared (int32_t *agg, const int32_t val)
 
__device__ void agg_sum_float_shared (int32_t *agg, const float val)
 
__device__ void agg_sum_double_shared (int64_t *agg, const double val)
 
__device__ void agg_max_shared (int64_t *agg, const int64_t val)
 
__device__ void agg_max_int32_shared (int32_t *agg, const int32_t val)
 
__device__ void agg_max_double_shared (int64_t *agg, const double val)
 
__device__ void agg_max_float_shared (int32_t *agg, const float val)
 
__device__ void agg_min_shared (int64_t *agg, const int64_t val)
 
__device__ void agg_min_int32_shared (int32_t *agg, const int32_t val)
 
__device__ void atomicMax16 (int16_t *agg, const int16_t val)
 
__device__ void atomicMax8 (int8_t *agg, const int8_t val)
 
__device__ void atomicMin16 (int16_t *agg, const int16_t val)
 
__device__ void atomicMin16SkipVal (int16_t *agg, const int16_t val, const int16_t skip_val)
 
__device__ void atomicMin8 (int8_t *agg, const int8_t val)
 
__device__ void atomicMin8SkipVal (int8_t *agg, const int8_t val, const int8_t skip_val)
 
__device__ void agg_max_int16_shared (int16_t *agg, const int16_t val)
 
__device__ void agg_max_int8_shared (int8_t *agg, const int8_t val)
 
__device__ void agg_min_int16_shared (int16_t *agg, const int16_t val)
 
__device__ void agg_min_int8_shared (int8_t *agg, const int8_t val)
 
__device__ void agg_min_double_shared (int64_t *agg, const double val)
 
__device__ void agg_min_float_shared (int32_t *agg, const float val)
 
__device__ void agg_id_shared (int64_t *agg, const int64_t val)
 
__device__ int32_t checked_single_agg_id_shared (int64_t *agg, const int64_t val, const int64_t null_val)
 
__device__ void agg_id_double_shared (int64_t *agg, const double val)
 
__device__ int32_t checked_single_agg_id_double_shared (int64_t *agg, const double val, const double null_val)
 
__device__ void agg_id_double_shared_slow (int64_t *agg, const double *val)
 
__device__ int32_t checked_single_agg_id_double_shared_slow (int64_t *agg, const double *valp, const double null_val)
 
__device__ void agg_id_float_shared (int32_t *agg, const float val)
 
__device__ int32_t checked_single_agg_id_float_shared (int32_t *agg, const float val, const float null_val)
 
__device__ void agg_max_int32_skip_val_shared (int32_t *agg, const int32_t val, const int32_t skip_val)
 
__device__ void agg_max_int16_skip_val_shared (int16_t *agg, const int16_t val, const int16_t skip_val)
 
__device__ void agg_min_int16_skip_val_shared (int16_t *agg, const int16_t val, const int16_t skip_val)
 
__device__ void agg_max_int8_skip_val_shared (int8_t *agg, const int8_t val, const int8_t skip_val)
 
__device__ void agg_min_int8_skip_val_shared (int8_t *agg, const int8_t val, const int8_t skip_val)
 
__device__ int32_t atomicMin32SkipVal (int32_t *address, int32_t val, const int32_t skip_val)
 
__device__ void agg_min_int32_skip_val_shared (int32_t *agg, const int32_t val, const int32_t skip_val)
 
__device__ int32_t atomicSum32SkipVal (int32_t *address, const int32_t val, const int32_t skip_val)
 
__device__ int32_t agg_sum_int32_skip_val_shared (int32_t *agg, const int32_t val, const int32_t skip_val)
 
__device__ int64_t atomicSum64SkipVal (int64_t *address, const int64_t val, const int64_t skip_val)
 
__device__ int64_t agg_sum_skip_val_shared (int64_t *agg, const int64_t val, const int64_t skip_val)
 
__device__ int64_t atomicMin64SkipVal (int64_t *address, int64_t val, const int64_t skip_val)
 
__device__ void agg_min_skip_val_shared (int64_t *agg, const int64_t val, const int64_t skip_val)
 
__device__ int64_t atomicMax64SkipVal (int64_t *address, int64_t val, const int64_t skip_val)
 
__device__ void agg_max_skip_val_shared (int64_t *agg, const int64_t val, const int64_t skip_val)
 
__device__ void agg_max_float_skip_val_shared (int32_t *agg, const float val, const float skip_val)
 
__device__ float atomicMinFltSkipVal (int32_t *address, float val, const float skip_val)
 
__device__ void agg_min_float_skip_val_shared (int32_t *agg, const float val, const float skip_val)
 
__device__ void atomicSumFltSkipVal (float *address, const float val, const float skip_val)
 
__device__ void agg_sum_float_skip_val_shared (int32_t *agg, const float val, const float skip_val)
 
__device__ void atomicSumDblSkipVal (double *address, const double val, const double skip_val)
 
__device__ void agg_sum_double_skip_val_shared (int64_t *agg, const double val, const double skip_val)
 
__device__ double atomicMinDblSkipVal (double *address, double val, const double skip_val)
 
__device__ void agg_min_double_skip_val_shared (int64_t *agg, const double val, const double skip_val)
 
__device__ double atomicMaxDblSkipVal (double *address, double val, const double skip_val)
 
__device__ void agg_max_double_skip_val_shared (int64_t *agg, const double val, const double skip_val)
 
__device__ bool slotEmptyKeyCAS (int64_t *slot, int64_t new_val, int64_t init_val)
 
__device__ bool slotEmptyKeyCAS_int32 (int32_t *slot, int32_t new_val, int32_t init_val)
 
__device__ bool slotEmptyKeyCAS_int16 (int16_t *slot, int16_t new_val, int16_t init_val)
 
__device__ bool slotEmptyKeyCAS_int8 (int8_t *slot, int8_t new_val, int8_t init_val)
 
__device__ uint64_t string_decode (int8_t *chunk_iter_, int64_t pos)
 
__device__ void linear_probabilistic_count (uint8_t *bitmap, const uint32_t bitmap_bytes, const uint8_t *key_bytes, const uint32_t key_len)
 
__device__ void agg_count_distinct_bitmap_gpu (int64_t *agg, const int64_t val, const int64_t min_val, const int64_t base_dev_addr, const int64_t base_host_addr, const uint64_t sub_bitmap_count, const uint64_t bitmap_bytes)
 
__device__ void agg_count_distinct_bitmap_skip_val_gpu (int64_t *agg, const int64_t val, const int64_t min_val, const int64_t skip_val, const int64_t base_dev_addr, const int64_t base_host_addr, const uint64_t sub_bitmap_count, const uint64_t bitmap_bytes)
 
__device__ void agg_approximate_count_distinct_gpu (int64_t *agg, const int64_t key, const uint32_t b, const int64_t base_dev_addr, const int64_t base_host_addr)
 
__device__ void force_sync ()
 
__device__ void sync_warp ()
 
__device__ void sync_warp_protected (int64_t thread_pos, int64_t row_count)
 

Variables

__device__ int64_t dw_sm_cycle_start [128]
 
__device__ int64_t dw_cycle_budget = 0
 
__device__ int32_t dw_abort = 0
 

Macro Definition Documentation

#define ADDR_T   uint64_t

Definition at line 1033 of file cuda_mapd_rt.cu.

#define ADDR_T   uint32_t

Definition at line 1033 of file cuda_mapd_rt.cu.

#define ADDR_T   uint64_t

Definition at line 1033 of file cuda_mapd_rt.cu.

#define ADDR_T   uint32_t

Definition at line 1033 of file cuda_mapd_rt.cu.

#define DATA_T   int64_t

Definition at line 1032 of file cuda_mapd_rt.cu.

#define DATA_T   int32_t

Definition at line 1032 of file cuda_mapd_rt.cu.

#define DATA_T   double

Definition at line 1032 of file cuda_mapd_rt.cu.

#define DATA_T   float

Definition at line 1032 of file cuda_mapd_rt.cu.

#define DEF_AGG_ID_INT_SHARED (   n)
Value:
extern "C" __device__ void agg_id_int##n##_shared(int##n##_t* agg, \
const int##n##_t val) { \
*agg = val; \
}

Definition at line 745 of file cuda_mapd_rt.cu.
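Expanding the macro is mechanical; for example (assuming an instantiation for 32-bit values), DEF_AGG_ID_INT_SHARED(32) yields:

extern "C" __device__ void agg_id_int32_shared(int32_t* agg, const int32_t val) {
  *agg = val;
}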

#define DEF_SKIP_AGG (   base_agg_func)
Value:
extern "C" __device__ ADDR_T base_agg_func##_skip_val_shared( \
ADDR_T* agg, const DATA_T val, const DATA_T skip_val) { \
if (val != skip_val) { \
return base_agg_func##_shared(agg, val); \
} \
return 0; \
}
#define DATA_T
#define ADDR_T

Definition at line 1017 of file cuda_mapd_rt.cu.

#define DEF_SKIP_AGG (   base_agg_func)
Value:
extern "C" __device__ ADDR_T base_agg_func##_skip_val_shared( \
ADDR_T* agg, const DATA_T val, const DATA_T skip_val) { \
if (val != skip_val) { \
return base_agg_func##_shared(agg, val); \
} \
return *agg; \
}
#define DATA_T
#define ADDR_T

Definition at line 1017 of file cuda_mapd_rt.cu.
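The two DEF_SKIP_AGG variants differ only in what they return when val equals skip_val: the first returns 0, the second returns the current contents of *agg. As a hedged illustration (the exact instantiations are not shown on this page), with DATA_T defined as int32_t, ADDR_T as uint32_t, and agg_count_int32 as the base aggregate, the first variant would expand to:

extern "C" __device__ uint32_t agg_count_int32_skip_val_shared(
    uint32_t* agg, const int32_t val, const int32_t skip_val) {
  if (val != skip_val) {
    return agg_count_int32_shared(agg, val);
  }
  return 0;
}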

#define EXECUTE_INCLUDE

Definition at line 1237 of file cuda_mapd_rt.cu.

#define init_group_by_buffer_gpu_impl   init_group_by_buffer_gpu

Definition at line 149 of file cuda_mapd_rt.cu.

Function Documentation

__device__ void agg_approximate_count_distinct_gpu ( int64_t *  agg,
const int64_t  key,
const uint32_t  b,
const int64_t  base_dev_addr,
const int64_t  base_host_addr 
)

Definition at line 1313 of file cuda_mapd_rt.cu.

References atomicMax(), get_rank(), and MurmurHash64A().

1318  {
1319  const uint64_t hash = MurmurHash64A(&key, sizeof(key), 0);
1320  const uint32_t index = hash >> (64 - b);
1321  const int32_t rank = get_rank(hash << b, 64 - b);
1322  const int64_t host_addr = *agg;
1323  int32_t* M = (int32_t*)(base_dev_addr + host_addr - base_host_addr);
1324  atomicMax(&M[index], rank);
1325 }
FORCE_INLINE uint8_t get_rank(uint64_t x, uint32_t b)
NEVER_INLINE DEVICE uint64_t MurmurHash64A(const void *key, int len, uint64_t seed)
Definition: MurmurHash.cpp:26
__device__ double atomicMax(double *address, double val)
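Read as a HyperLogLog update (a hedged summary of the code above): the top b bits of the 64-bit MurmurHash select one of 2^b registers, get_rank() computes the rank (position of the leftmost set bit) of the remaining 64 - b bits, and each register keeps the maximum rank observed, updated atomically via atomicMax().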

__device__ void agg_count_distinct_bitmap_gpu ( int64_t *  agg,
const int64_t  val,
const int64_t  min_val,
const int64_t  base_dev_addr,
const int64_t  base_host_addr,
const uint64_t  sub_bitmap_count,
const uint64_t  bitmap_bytes 
)

Definition at line 1266 of file cuda_mapd_rt.cu.

Referenced by agg_count_distinct_bitmap_skip_val_gpu().

1272  {
1273  const uint64_t bitmap_idx = val - min_val;
1274  const uint32_t byte_idx = bitmap_idx >> 3;
1275  const uint32_t word_idx = byte_idx >> 2;
1276  const uint32_t byte_word_idx = byte_idx & 3;
1277  const int64_t host_addr = *agg;
1278  uint32_t* bitmap = (uint32_t*)(base_dev_addr + host_addr - base_host_addr +
1279  (threadIdx.x & (sub_bitmap_count - 1)) * bitmap_bytes);
1280  switch (byte_word_idx) {
1281  case 0:
1282  atomicOr(&bitmap[word_idx], 1 << (bitmap_idx & 7));
1283  break;
1284  case 1:
1285  atomicOr(&bitmap[word_idx], 1 << ((bitmap_idx & 7) + 8));
1286  break;
1287  case 2:
1288  atomicOr(&bitmap[word_idx], 1 << ((bitmap_idx & 7) + 16));
1289  break;
1290  case 3:
1291  atomicOr(&bitmap[word_idx], 1 << ((bitmap_idx & 7) + 24));
1292  break;
1293  default:
1294  break;
1295  }
1296 }
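A worked example (not from the source): with sub_bitmap_count = 1 and val - min_val = 45, byte_idx = 45 >> 3 = 5, word_idx = 5 >> 2 = 1, and byte_word_idx = 5 & 3 = 1, so the case 1 branch sets bit (45 & 7) + 8 = 13 of 32-bit word 1, i.e. bit 45 of the bitmap. The (threadIdx.x & (sub_bitmap_count - 1)) * bitmap_bytes term spreads threads across sub_bitmap_count copies of the bitmap to reduce atomic contention; with sub_bitmap_count = 1 all threads share a single bitmap.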

__device__ void agg_count_distinct_bitmap_skip_val_gpu ( int64_t *  agg,
const int64_t  val,
const int64_t  min_val,
const int64_t  skip_val,
const int64_t  base_dev_addr,
const int64_t  base_host_addr,
const uint64_t  sub_bitmap_count,
const uint64_t  bitmap_bytes 
)

Definition at line 1298 of file cuda_mapd_rt.cu.

References agg_count_distinct_bitmap_gpu().

1306  {
1307  if (val != skip_val) {
1308  agg_count_distinct_bitmap_gpu(
1309  agg, val, min_val, base_dev_addr, base_host_addr, sub_bitmap_count, bitmap_bytes);
1310  }
1311 }
__device__ void agg_count_distinct_bitmap_gpu(int64_t *agg, const int64_t val, const int64_t min_val, const int64_t base_dev_addr, const int64_t base_host_addr, const uint64_t sub_bitmap_count, const uint64_t bitmap_bytes)

__device__ uint64_t agg_count_double_shared ( uint64_t *  agg,
const double  val 
)

Definition at line 503 of file cuda_mapd_rt.cu.

References agg_count_shared().

503  {
504  return agg_count_shared(agg, val);
505 }
__device__ uint64_t agg_count_shared(uint64_t *agg, const int64_t val)

__device__ uint32_t agg_count_float_shared ( uint32_t *  agg,
const float  val 
)

Definition at line 507 of file cuda_mapd_rt.cu.

References agg_count_int32_shared().

507  {
508  return agg_count_int32_shared(agg, val);
509 }
__device__ uint32_t agg_count_int32_shared(uint32_t *agg, const int32_t val)

__device__ uint32_t agg_count_int32_shared ( uint32_t *  agg,
const int32_t  val 
)

Definition at line 499 of file cuda_mapd_rt.cu.

References atomicAdd().

Referenced by agg_count_float_shared().

499  {
500  return atomicAdd(agg, 1UL);
501 }
__device__ double atomicAdd(double *address, double val)

__device__ uint64_t agg_count_shared ( uint64_t *  agg,
const int64_t  val 
)

Definition at line 495 of file cuda_mapd_rt.cu.

References atomicAdd().

Referenced by agg_count_double_shared().

495  {
496  return static_cast<uint64_t>(atomicAdd(reinterpret_cast<uint32_t*>(agg), 1UL));
497 }
__device__ double atomicAdd(double *address, double val)

__device__ void agg_from_smem_to_gmem_binId_count ( int64_t *  gmem_dest,
int64_t *  smem_src,
const int32_t  num_elements 
)

Aggregates the result stored in shared memory back into global memory, and also writes the stored bin ID, if any, back to global memory. Memory layout assumption: each 64-bit unit of shared memory data is laid out as follows: [0..31: the stored bin ID, to be written back][32..63: the count result, to be aggregated]

Definition at line 107 of file cuda_mapd_rt.cu.

References atomicAdd().

109  {
110  __syncthreads();
111 #pragma unroll
112  for (int i = threadIdx.x; i < num_elements; i += blockDim.x) {
113  int32_t bin_id = *reinterpret_cast<int32_t*>(smem_src + i);
114  int32_t count_result = *(reinterpret_cast<int32_t*>(smem_src + i) + 1);
115  if (count_result) { // non-zero count
116  atomicAdd(reinterpret_cast<unsigned int*>(gmem_dest + i) + 1,
117  static_cast<int32_t>(count_result));
118  // writing back the binId, only if count_result is non-zero
119  *reinterpret_cast<unsigned int*>(gmem_dest + i) = static_cast<int32_t>(bin_id);
120  }
121  }
122 }
__device__ double atomicAdd(double *address, double val)

__device__ void agg_from_smem_to_gmem_count_binId ( int64_t *  gmem_dest,
int64_t *  smem_src,
const int32_t  num_elements 
)

Aggregates the result stored in shared memory back into global memory, and also writes the stored bin ID, if any, back to global memory. Memory layout assumption: each 64-bit unit of shared memory data is laid out as follows: [0..31: the count result, to be aggregated][32..63: the stored bin ID, to be written back]

Definition at line 131 of file cuda_mapd_rt.cu.

References atomicAdd().

133  {
134  __syncthreads();
135 #pragma unroll
136  for (int i = threadIdx.x; i < num_elements; i += blockDim.x) {
137  int32_t count_result = *reinterpret_cast<int32_t*>(smem_src + i);
138  int32_t bin_id = *(reinterpret_cast<int32_t*>(smem_src + i) + 1);
139  if (count_result) { // non-zero count
140  atomicAdd(reinterpret_cast<unsigned int*>(gmem_dest + i),
141  static_cast<int32_t>(count_result));
142  // writing back the binId, only if count_result is non-zero
143  *(reinterpret_cast<unsigned int*>(gmem_dest + i) + 1) =
144  static_cast<int32_t>(bin_id);
145  }
146  }
147 }
__device__ double atomicAdd(double *address, double val)

__device__ void agg_from_smem_to_gmem_nop ( int64_t *  gmem_dest,
int64_t *  smem_src,
const int32_t  num_elements 
)

Definition at line 96 of file cuda_mapd_rt.cu.

98  {}
__device__ void agg_id_double_shared ( int64_t *  agg,
const double  val 
)

Definition at line 757 of file cuda_mapd_rt.cu.

757  {
758  *agg = *(reinterpret_cast<const int64_t*>(&val));
759 }
__device__ void agg_id_double_shared_slow ( int64_t *  agg,
const double *  val 
)

Definition at line 788 of file cuda_mapd_rt.cu.

788  {
789  *agg = *(reinterpret_cast<const int64_t*>(val));
790 }
__device__ void agg_id_float_shared ( int32_t *  agg,
const float  val 
)

Definition at line 821 of file cuda_mapd_rt.cu.

821  {
822  *agg = __float_as_int(val);
823 }
__device__ void agg_id_shared ( int64_t *  agg,
const int64_t  val 
)

Definition at line 714 of file cuda_mapd_rt.cu.

714  {
715  *agg = val;
716 }
__device__ void agg_max_double_shared ( int64_t *  agg,
const double  val 
)

Definition at line 535 of file cuda_mapd_rt.cu.

References atomicMax().

535  {
536  atomicMax(reinterpret_cast<double*>(agg), val);
537 }
__device__ double atomicMax(double *address, double val)

__device__ void agg_max_double_skip_val_shared ( int64_t *  agg,
const double  val,
const double  skip_val 
)

Definition at line 1147 of file cuda_mapd_rt.cu.

References atomicMaxDblSkipVal().

1149  {
1150  if (val != skip_val) {
1151  atomicMaxDblSkipVal(reinterpret_cast<double*>(agg), val, skip_val);
1152  }
1153 }
__device__ double atomicMaxDblSkipVal(double *address, double val, const double skip_val)

__device__ void agg_max_float_shared ( int32_t *  agg,
const float  val 
)

Definition at line 539 of file cuda_mapd_rt.cu.

References atomicMax().

539  {
540  atomicMax(reinterpret_cast<float*>(agg), val);
541 }
__device__ double atomicMax(double *address, double val)

__device__ void agg_max_float_skip_val_shared ( int32_t *  agg,
const float  val,
const float  skip_val 
)

Definition at line 1039 of file cuda_mapd_rt.cu.

References atomicMax().

1041  {
1042  if (__float_as_int(val) != __float_as_int(skip_val)) {
1043  float old = atomicExch(reinterpret_cast<float*>(agg), -FLT_MAX);
1044  atomicMax(reinterpret_cast<float*>(agg),
1045  __float_as_int(old) == __float_as_int(skip_val) ? val : fmaxf(old, val));
1046  }
1047 }
__device__ double atomicMax(double *address, double val)

__device__ void agg_max_int16_shared ( int16_t *  agg,
const int16_t  val 
)

Definition at line 690 of file cuda_mapd_rt.cu.

References atomicMax16().

Referenced by agg_max_int16_skip_val_shared().

690  {
691  return atomicMax16(agg, val);
692 }
__device__ void atomicMax16(int16_t *agg, const int16_t val)

__device__ void agg_max_int16_skip_val_shared ( int16_t *  agg,
const int16_t  val,
const int16_t  skip_val 
)

Definition at line 882 of file cuda_mapd_rt.cu.

References agg_max_int16_shared().

884  {
885  if (val != skip_val) {
886  agg_max_int16_shared(agg, val);
887  }
888 }
__device__ void agg_max_int16_shared(int16_t *agg, const int16_t val)

__device__ void agg_max_int32_shared ( int32_t *  agg,
const int32_t  val 
)

Definition at line 531 of file cuda_mapd_rt.cu.

References atomicMax().

Referenced by agg_max_int32_skip_val_shared().

531  {
532  atomicMax(agg, val);
533 }
__device__ double atomicMax(double *address, double val)

__device__ void agg_max_int32_skip_val_shared ( int32_t *  agg,
const int32_t  val,
const int32_t  skip_val 
)

Definition at line 874 of file cuda_mapd_rt.cu.

References agg_max_int32_shared().

876  {
877  if (val != skip_val) {
878  agg_max_int32_shared(agg, val);
879  }
880 }
__device__ void agg_max_int32_shared(int32_t *agg, const int32_t val)

__device__ void agg_max_int8_shared ( int8_t *  agg,
const int8_t  val 
)

Definition at line 694 of file cuda_mapd_rt.cu.

References atomicMax8().

Referenced by agg_max_int8_skip_val_shared().

694  {
695  return atomicMax8(agg, val);
696 }
__device__ void atomicMax8(int8_t *agg, const int8_t val)

__device__ void agg_max_int8_skip_val_shared ( int8_t *  agg,
const int8_t  val,
const int8_t  skip_val 
)

Definition at line 898 of file cuda_mapd_rt.cu.

References agg_max_int8_shared().

900  {
901  if (val != skip_val) {
902  agg_max_int8_shared(agg, val);
903  }
904 }
__device__ void agg_max_int8_shared(int8_t *agg, const int8_t val)

__device__ void agg_max_shared ( int64_t *  agg,
const int64_t  val 
)

Definition at line 527 of file cuda_mapd_rt.cu.

References atomicMax64().

527  {
528  atomicMax64(agg, val);
529 }
__device__ int64_t atomicMax64(int64_t *address, int64_t val)

__device__ void agg_max_skip_val_shared ( int64_t *  agg,
const int64_t  val,
const int64_t  skip_val 
)

Definition at line 1008 of file cuda_mapd_rt.cu.

References atomicMax64SkipVal().

1010  {
1011  if (val != skip_val) {
1012  atomicMax64SkipVal(agg, val, skip_val);
1013  }
1014 }
__device__ int64_t atomicMax64SkipVal(int64_t *address, int64_t val, const int64_t skip_val)

__device__ void agg_min_double_shared ( int64_t *  agg,
const double  val 
)

Definition at line 706 of file cuda_mapd_rt.cu.

References atomicMin().

706  {
707  atomicMin(reinterpret_cast<double*>(agg), val);
708 }
__device__ double atomicMin(double *address, double val)

__device__ void agg_min_double_skip_val_shared ( int64_t *  agg,
const double  val,
const double  skip_val 
)

Definition at line 1119 of file cuda_mapd_rt.cu.

References atomicMinDblSkipVal().

1121  {
1122  if (val != skip_val) {
1123  atomicMinDblSkipVal(reinterpret_cast<double*>(agg), val, skip_val);
1124  }
1125 }
__device__ double atomicMinDblSkipVal(double *address, double val, const double skip_val)

__device__ void agg_min_float_shared ( int32_t *  agg,
const float  val 
)

Definition at line 710 of file cuda_mapd_rt.cu.

References atomicMin().

710  {
711  atomicMin(reinterpret_cast<float*>(agg), val);
712 }
__device__ double atomicMin(double *address, double val)

__device__ void agg_min_float_skip_val_shared ( int32_t *  agg,
const float  val,
const float  skip_val 
)

Definition at line 1056 of file cuda_mapd_rt.cu.

References atomicMinFltSkipVal().

1058  {
1059  if (__float_as_int(val) != __float_as_int(skip_val)) {
1060  atomicMinFltSkipVal(agg, val, skip_val);
1061  }
1062 }
__device__ float atomicMinFltSkipVal(int32_t *address, float val, const float skip_val)

__device__ void agg_min_int16_shared ( int16_t *  agg,
const int16_t  val 
)

Definition at line 698 of file cuda_mapd_rt.cu.

References atomicMin16().

698  {
699  return atomicMin16(agg, val);
700 }
__device__ void atomicMin16(int16_t *agg, const int16_t val)

__device__ void agg_min_int16_skip_val_shared ( int16_t *  agg,
const int16_t  val,
const int16_t  skip_val 
)

Definition at line 890 of file cuda_mapd_rt.cu.

References atomicMin16SkipVal().

892  {
893  if (val != skip_val) {
894  atomicMin16SkipVal(agg, val, skip_val);
895  }
896 }
__device__ void atomicMin16SkipVal(int16_t *agg, const int16_t val, const int16_t skip_val)

__device__ void agg_min_int32_shared ( int32_t *  agg,
const int32_t  val 
)

Definition at line 547 of file cuda_mapd_rt.cu.

References atomicMin().

547  {
548  atomicMin(agg, val);
549 }
__device__ double atomicMin(double *address, double val)

__device__ void agg_min_int32_skip_val_shared ( int32_t *  agg,
const int32_t  val,
const int32_t  skip_val 
)

Definition at line 921 of file cuda_mapd_rt.cu.

References atomicMin32SkipVal().

923  {
924  if (val != skip_val) {
925  atomicMin32SkipVal(agg, val, skip_val);
926  }
927 }
__device__ int32_t atomicMin32SkipVal(int32_t *address, int32_t val, const int32_t skip_val)

__device__ void agg_min_int8_shared ( int8_t *  agg,
const int8_t  val 
)

Definition at line 702 of file cuda_mapd_rt.cu.

References atomicMin8().

702  {
703  return atomicMin8(agg, val);
704 }
__device__ void atomicMin8(int8_t *agg, const int8_t val)

__device__ void agg_min_int8_skip_val_shared ( int8_t *  agg,
const int8_t  val,
const int8_t  skip_val 
)

Definition at line 906 of file cuda_mapd_rt.cu.

References atomicMin8SkipVal().

908  {
909  if (val != skip_val) {
910  atomicMin8SkipVal(agg, val, skip_val);
911  }
912 }
__device__ void atomicMin8SkipVal(int8_t *agg, const int8_t val, const int8_t skip_val)

__device__ void agg_min_shared ( int64_t *  agg,
const int64_t  val 
)

Definition at line 543 of file cuda_mapd_rt.cu.

References atomicMin64().

543  {
544  atomicMin64(agg, val);
545 }
__device__ int64_t atomicMin64(int64_t *address, int64_t val)

__device__ void agg_min_skip_val_shared ( int64_t *  agg,
const int64_t  val,
const int64_t  skip_val 
)

Definition at line 983 of file cuda_mapd_rt.cu.

References atomicMin64SkipVal().

985  {
986  if (val != skip_val) {
987  atomicMin64SkipVal(agg, val, skip_val);
988  }
989 }
__device__ int64_t atomicMin64SkipVal(int64_t *address, int64_t val, const int64_t skip_val)

__device__ void agg_sum_double_shared ( int64_t *  agg,
const double  val 
)

Definition at line 523 of file cuda_mapd_rt.cu.

References atomicAdd().

523  {
524  atomicAdd(reinterpret_cast<double*>(agg), val);
525 }
__device__ double atomicAdd(double *address, double val)

__device__ void agg_sum_double_skip_val_shared ( int64_t *  agg,
const double  val,
const double  skip_val 
)

Definition at line 1089 of file cuda_mapd_rt.cu.

References atomicSumDblSkipVal().

1091  {
1092  if (__double_as_longlong(val) != __double_as_longlong(skip_val)) {
1093  atomicSumDblSkipVal(reinterpret_cast<double*>(agg), val, skip_val);
1094  }
1095 }
__device__ void atomicSumDblSkipVal(double *address, const double val, const double skip_val)

__device__ void agg_sum_float_shared ( int32_t *  agg,
const float  val 
)

Definition at line 519 of file cuda_mapd_rt.cu.

References atomicAdd().

519  {
520  atomicAdd(reinterpret_cast<float*>(agg), val);
521 }
__device__ double atomicAdd(double *address, double val)

__device__ void agg_sum_float_skip_val_shared ( int32_t *  agg,
const float  val,
const float  skip_val 
)

Definition at line 1071 of file cuda_mapd_rt.cu.

References atomicSumFltSkipVal().

1073  {
1074  if (__float_as_int(val) != __float_as_int(skip_val)) {
1075  atomicSumFltSkipVal(reinterpret_cast<float*>(agg), val, skip_val);
1076  }
1077 }
__device__ void atomicSumFltSkipVal(float *address, const float val, const float skip_val)

__device__ int32_t agg_sum_int32_shared ( int32_t *  agg,
const int32_t  val 
)

Definition at line 515 of file cuda_mapd_rt.cu.

References atomicAdd().

515  {
516  return atomicAdd(agg, val);
517 }
__device__ double atomicAdd(double *address, double val)

__device__ int32_t agg_sum_int32_skip_val_shared ( int32_t *  agg,
const int32_t  val,
const int32_t  skip_val 
)

Definition at line 938 of file cuda_mapd_rt.cu.

References atomicSum32SkipVal().

940  {
941  if (val != skip_val) {
942  const int32_t old = atomicSum32SkipVal(agg, val, skip_val);
943  return old;
944  }
945  return 0;
946 }
__device__ int32_t atomicSum32SkipVal(int32_t *address, const int32_t val, const int32_t skip_val)

__device__ int64_t agg_sum_shared ( int64_t *  agg,
const int64_t  val 
)

Definition at line 511 of file cuda_mapd_rt.cu.

References atomicAdd().

511  {
512  return atomicAdd(reinterpret_cast<unsigned long long*>(agg), val);
513 }
__device__ double atomicAdd(double *address, double val)

__device__ int64_t agg_sum_skip_val_shared ( int64_t *  agg,
const int64_t  val,
const int64_t  skip_val 
)

Definition at line 957 of file cuda_mapd_rt.cu.

References atomicSum64SkipVal().

959  {
960  if (val != skip_val) {
961  return atomicSum64SkipVal(agg, val, skip_val);
962  }
963  return 0;
964 }
__device__ int64_t atomicSum64SkipVal(int64_t *address, const int64_t val, const int64_t skip_val)

__device__ int64_t* alloc_shared_mem_dynamic ( )

Dynamically allocates shared memory per block. The amount of shared memory allocated is defined at kernel launch time. Returns a pointer to the beginning of the allocated shared memory.

Definition at line 51 of file cuda_mapd_rt.cu.

Referenced by init_shared_mem_dynamic().

51  {
52  extern __shared__ int64_t groups_buffer_smem[];
53  return groups_buffer_smem;
54 }
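A minimal usage sketch (hypothetical kernel and parameter names, not from this file), showing that the size of the extern __shared__ buffer returned above is fixed by the third kernel-launch parameter:

// Hypothetical kernel that uses the dynamically sized shared memory buffer.
__global__ void group_by_kernel(int64_t* out) {
  extern __shared__ int64_t groups_buffer_smem[];  // same declaration as in alloc_shared_mem_dynamic()
  groups_buffer_smem[threadIdx.x] = 0;
  __syncthreads();
  // ... use groups_buffer_smem as the per-block group-by buffer, then write results to out ...
}

// Host side: reserve shared_mem_bytes of dynamic shared memory per block.
// group_by_kernel<<<grid_dim, block_dim, shared_mem_bytes>>>(dev_out);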

__device__ double atomicAdd ( double *  address,
double  val 
)

Definition at line 420 of file cuda_mapd_rt.cu.

Referenced by agg_count_int32_shared(), agg_count_shared(), agg_from_smem_to_gmem_binId_count(), agg_from_smem_to_gmem_count_binId(), agg_sum_double_shared(), agg_sum_float_shared(), agg_sum_int32_shared(), agg_sum_shared(), atomicSum32SkipVal(), atomicSum64SkipVal(), atomicSumDblSkipVal(), atomicSumFltSkipVal(), and get_matching_group_value().

420  {
421  unsigned long long int* address_as_ull = (unsigned long long int*)address;
422  unsigned long long int old = *address_as_ull, assumed;
423 
424  do {
425  assumed = old;
426  old = atomicCAS(address_as_ull,
427  assumed,
428  __double_as_longlong(val + __longlong_as_double(assumed)));
429 
430  // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN)
431  } while (assumed != old);
432 
433  return __longlong_as_double(old);
434 }
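The loop above is the standard atomicCAS emulation of a 64-bit floating-point read-modify-write, needed where hardware double atomicAdd is unavailable. As a hedged illustration, the same pattern works for any combining function; the atomicMulDouble helper below is hypothetical and not part of this runtime:

__device__ double atomicMulDouble(double* address, double val) {
  unsigned long long int* address_as_ull = (unsigned long long int*)address;
  unsigned long long int old = *address_as_ull, assumed;
  do {
    assumed = old;
    // Reinterpret the stored bits, combine with val, and try to publish the result.
    old = atomicCAS(address_as_ull,
                    assumed,
                    __double_as_longlong(val * __longlong_as_double(assumed)));
    // Integer comparison avoids an infinite loop when the stored value is NaN.
  } while (assumed != old);
  return __longlong_as_double(old);
}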

__device__ double atomicMax ( double *  address,
double  val 
)

Definition at line 437 of file cuda_mapd_rt.cu.

Referenced by agg_approximate_count_distinct_gpu(), agg_max_double_shared(), agg_max_float_shared(), agg_max_float_skip_val_shared(), agg_max_int32_shared(), and approximate_distinct_tuples_impl().

437  {
438  unsigned long long int* address_as_ull = (unsigned long long int*)address;
439  unsigned long long int old = *address_as_ull, assumed;
440 
441  do {
442  assumed = old;
443  old = atomicCAS(address_as_ull,
444  assumed,
445  __double_as_longlong(max(val, __longlong_as_double(assumed))));
446 
447  // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN)
448  } while (assumed != old);
449 
450  return __longlong_as_double(old);
451 }

__device__ float atomicMax ( float *  address,
float  val 
)

Definition at line 453 of file cuda_mapd_rt.cu.

453  {
454  int* address_as_int = (int*)address;
455  int old = *address_as_int, assumed;
456 
457  do {
458  assumed = old;
459  old = atomicCAS(
460  address_as_int, assumed, __float_as_int(max(val, __int_as_float(assumed))));
461 
462  // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN)
463  } while (assumed != old);
464 
465  return __int_as_float(old);
466 }
__device__ void atomicMax16 ( int16_t *  agg,
const int16_t  val 
)

Definition at line 552 of file cuda_mapd_rt.cu.

Referenced by agg_max_int16_shared().

552  {
553  // properly align the input pointer:
554  unsigned int* base_address_u32 =
555  reinterpret_cast<unsigned int*>(reinterpret_cast<size_t>(agg) & ~0x3);
556 
557  unsigned int old_value = *base_address_u32;
558  unsigned int swap_value, compare_value;
559  do {
560  compare_value = old_value;
561  swap_value =
562  (reinterpret_cast<size_t>(agg) & 0x2)
563  ? static_cast<unsigned int>(max(static_cast<int16_t>(old_value >> 16), val))
564  << 16 |
565  (old_value & 0xFFFF)
566  : (old_value & 0xFFFF0000) |
567  static_cast<unsigned int>(
568  max(static_cast<int16_t>(old_value & 0xFFFF), val));
569  old_value = atomicCAS(base_address_u32, compare_value, swap_value);
570  } while (old_value != compare_value);
571 }

__device__ int64_t atomicMax64 ( int64_t *  address,
int64_t  val 
)

Definition at line 393 of file cuda_mapd_rt.cu.

Referenced by agg_max_shared().

393  {
394  unsigned long long int* address_as_ull = (unsigned long long int*)address;
395  unsigned long long int old = *address_as_ull, assumed;
396 
397  do {
398  assumed = old;
399  old = atomicCAS(address_as_ull, assumed, max((long long)val, (long long)assumed));
400  } while (assumed != old);
401 
402  return old;
403 }

__device__ int64_t atomicMax64SkipVal ( int64_t *  address,
int64_t  val,
const int64_t  skip_val 
)

Definition at line 991 of file cuda_mapd_rt.cu.

Referenced by agg_max_skip_val_shared().

993  {
994  unsigned long long int* address_as_ull =
995  reinterpret_cast<unsigned long long int*>(address);
996  unsigned long long int old = *address_as_ull, assumed;
997 
998  do {
999  assumed = old;
1000  old = atomicCAS(address_as_ull,
1001  assumed,
1002  assumed == skip_val ? val : max((long long)val, (long long)assumed));
1003  } while (assumed != old);
1004 
1005  return old;
1006 }

__device__ void atomicMax8 ( int8_t *  agg,
const int8_t  val 
)

Definition at line 573 of file cuda_mapd_rt.cu.

Referenced by agg_max_int8_shared().

573  {
574  // properly align the input pointer:
575  unsigned int* base_address_u32 =
576  reinterpret_cast<unsigned int*>(reinterpret_cast<size_t>(agg) & ~0x3);
577 
578  // __byte_perm(unsigned int A, unsigned int B, unsigned int s):
579  // if s == 0x3214 returns {A[31..24], A[23..16], A[15..8], B[7..0]}
580  // if s == 0x3240 returns {A[31..24], A[23..16], B[7...0], A[7..0]}
581  // if s == 0x3410 returns {A[31..24], B[7....0], A[15..8], A[7..0]}
582  // if s == 0x4210 returns {B[7....0], A[23..16], A[15..8], A[7..0]}
583  constexpr unsigned int byte_permutations[] = {0x3214, 0x3240, 0x3410, 0x4210};
584  unsigned int old_value = *base_address_u32;
585  unsigned int swap_value, compare_value;
586  do {
587  compare_value = old_value;
588  auto max_value = static_cast<unsigned int>(
589  // compare val with its corresponding bits in the compare_value
590  max(val,
591  static_cast<int8_t>(__byte_perm(
592  compare_value, 0, (reinterpret_cast<size_t>(agg) & 0x3) | 0x4440))));
593  swap_value = __byte_perm(
594  compare_value, max_value, byte_permutations[reinterpret_cast<size_t>(agg) & 0x3]);
595  old_value = atomicCAS(base_address_u32, compare_value, swap_value);
596  } while (compare_value != old_value);
597 }
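A worked example (not from the source): if agg points at byte offset 2 within its aligned 32-bit word, the selector (agg & 0x3) | 0x4440 is 0x4442, so __byte_perm(compare_value, 0, 0x4442) extracts byte 2 of compare_value as the current 8-bit value; after taking the max with val, byte_permutations[2] == 0x3410 splices the new byte back into position 2 while leaving the other three bytes untouched, and the updated word is published with a 32-bit atomicCAS.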

__device__ double atomicMaxDblSkipVal ( double *  address,
double  val,
const double  skip_val 
)

Definition at line 1127 of file cuda_mapd_rt.cu.

Referenced by agg_max_double_skip_val_shared().

1129  {
1130  unsigned long long int* address_as_ull = (unsigned long long int*)address;
1131  unsigned long long int old = *address_as_ull;
1132  unsigned long long int skip_val_as_ull = *((unsigned long long int*)&skip_val);
1133  unsigned long long int assumed;
1134 
1135  do {
1136  assumed = old;
1137  old = atomicCAS(address_as_ull,
1138  assumed,
1139  assumed == skip_val_as_ull
1140  ? *((unsigned long long int*)&val)
1141  : __double_as_longlong(max(val, __longlong_as_double(assumed))));
1142  } while (assumed != old);
1143 
1144  return __longlong_as_double(old);
1145 }

__device__ double atomicMin ( double *  address,
double  val 
)

Definition at line 468 of file cuda_mapd_rt.cu.

Referenced by agg_min_double_shared(), agg_min_float_shared(), agg_min_int32_shared(), atomicMin32SkipVal(), atomicMinFltSkipVal(), and compute_bucket_sizes_impl().

468  {
469  unsigned long long int* address_as_ull = (unsigned long long int*)address;
470  unsigned long long int old = *address_as_ull, assumed;
471 
472  do {
473  assumed = old;
474  old = atomicCAS(address_as_ull,
475  assumed,
476  __double_as_longlong(min(val, __longlong_as_double(assumed))));
477  } while (assumed != old);
478 
479  return __longlong_as_double(old);
480 }

__device__ double atomicMin ( float *  address,
float  val 
)

Definition at line 482 of file cuda_mapd_rt.cu.

482  {
483  int* address_as_ull = (int*)address;
484  int old = *address_as_ull, assumed;
485 
486  do {
487  assumed = old;
488  old = atomicCAS(
489  address_as_ull, assumed, __float_as_int(min(val, __int_as_float(assumed))));
490  } while (assumed != old);
491 
492  return __int_as_float(old);
493 }
__device__ void atomicMin16 ( int16_t *  agg,
const int16_t  val 
)

Definition at line 599 of file cuda_mapd_rt.cu.

Referenced by agg_min_int16_shared().

599  {
600  // properly align the input pointer:
601  unsigned int* base_address_u32 =
602  reinterpret_cast<unsigned int*>(reinterpret_cast<size_t>(agg) & ~0x3);
603 
604  unsigned int old_value = *base_address_u32;
605  unsigned int swap_value, compare_value;
606  do {
607  compare_value = old_value;
608  swap_value =
609  (reinterpret_cast<size_t>(agg) & 0x2)
610  ? static_cast<unsigned int>(min(static_cast<int16_t>(old_value >> 16), val))
611  << 16 |
612  (old_value & 0xFFFF)
613  : (old_value & 0xFFFF0000) |
614  static_cast<unsigned int>(
615  min(static_cast<int16_t>(old_value & 0xFFFF), val));
616  old_value = atomicCAS(base_address_u32, compare_value, swap_value);
617  } while (old_value != compare_value);
618 }

__device__ void atomicMin16SkipVal ( int16_t *  agg,
const int16_t  val,
const int16_t  skip_val 
)

Definition at line 620 of file cuda_mapd_rt.cu.

Referenced by agg_min_int16_skip_val_shared().

622  {
623  // properly align the input pointer:
624  unsigned int* base_address_u32 =
625  reinterpret_cast<unsigned int*>(reinterpret_cast<size_t>(agg) & ~0x3);
626 
627  unsigned int old_value = *base_address_u32;
628  unsigned int swap_value, compare_value;
629  do {
630  compare_value = old_value;
631  int16_t selected_old_val = (reinterpret_cast<size_t>(agg) & 0x2)
632  ? static_cast<int16_t>(old_value >> 16)
633  : static_cast<int16_t>(old_value & 0xFFFF);
634 
635  swap_value =
636  (reinterpret_cast<size_t>(agg) & 0x2)
637  ? static_cast<unsigned int>(
638  selected_old_val == skip_val ? val : min(selected_old_val, val))
639  << 16 |
640  (old_value & 0xFFFF)
641  : (old_value & 0xFFFF0000) |
642  static_cast<unsigned int>(
643  selected_old_val == skip_val ? val : min(selected_old_val, val));
644  old_value = atomicCAS(base_address_u32, compare_value, swap_value);
645  } while (old_value != compare_value);
646 }

__device__ int32_t atomicMin32SkipVal ( int32_t *  address,
int32_t  val,
const int32_t  skip_val 
)

Definition at line 914 of file cuda_mapd_rt.cu.

References atomicMin().

Referenced by agg_min_int32_skip_val_shared().

916  {
917  int32_t old = atomicExch(address, INT_MAX);
918  return atomicMin(address, old == skip_val ? val : min(old, val));
919 }
__device__ double atomicMin(double *address, double val)

__device__ int64_t atomicMin64 ( int64_t *  address,
int64_t  val 
)

Definition at line 405 of file cuda_mapd_rt.cu.

Referenced by agg_min_shared().

405  {
406  unsigned long long int* address_as_ull = (unsigned long long int*)address;
407  unsigned long long int old = *address_as_ull, assumed;
408 
409  do {
410  assumed = old;
411  old = atomicCAS(address_as_ull, assumed, min((long long)val, (long long)assumed));
412  } while (assumed != old);
413 
414  return old;
415 }

__device__ int64_t atomicMin64SkipVal ( int64_t *  address,
int64_t  val,
const int64_t  skip_val 
)

Definition at line 966 of file cuda_mapd_rt.cu.

Referenced by agg_min_skip_val_shared().

968  {
969  unsigned long long int* address_as_ull =
970  reinterpret_cast<unsigned long long int*>(address);
971  unsigned long long int old = *address_as_ull, assumed;
972 
973  do {
974  assumed = old;
975  old = atomicCAS(address_as_ull,
976  assumed,
977  assumed == skip_val ? val : min((long long)val, (long long)assumed));
978  } while (assumed != old);
979 
980  return old;
981 }

__device__ void atomicMin8 ( int8_t *  agg,
const int8_t  val 
)

Definition at line 648 of file cuda_mapd_rt.cu.

Referenced by agg_min_int8_shared().

648  {
649  // properly align the input pointer:
650  unsigned int* base_address_u32 =
651  reinterpret_cast<unsigned int*>(reinterpret_cast<size_t>(agg) & ~0x3);
652 
653  constexpr unsigned int byte_permutations[] = {0x3214, 0x3240, 0x3410, 0x4210};
654  unsigned int old_value = *base_address_u32;
655  unsigned int swap_value, compare_value;
656  do {
657  compare_value = old_value;
658  auto min_value = static_cast<unsigned int>(
659  min(val,
660  static_cast<int8_t>(__byte_perm(
661  compare_value, 0, (reinterpret_cast<size_t>(agg) & 0x3) | 0x4440))));
662  swap_value = __byte_perm(
663  compare_value, min_value, byte_permutations[reinterpret_cast<size_t>(agg) & 0x3]);
664  old_value = atomicCAS(base_address_u32, compare_value, swap_value);
665  } while (compare_value != old_value);
666 }

__device__ void atomicMin8SkipVal ( int8_t *  agg,
const int8_t  val,
const int8_t  skip_val 
)

Definition at line 668 of file cuda_mapd_rt.cu.

Referenced by agg_min_int8_skip_val_shared().

670  {
671  // properly align the input pointer:
672  unsigned int* base_address_u32 =
673  reinterpret_cast<unsigned int*>(reinterpret_cast<size_t>(agg) & ~0x3);
674 
675  constexpr unsigned int byte_permutations[] = {0x3214, 0x3240, 0x3410, 0x4210};
676  unsigned int old_value = *base_address_u32;
677  unsigned int swap_value, compare_value;
678  do {
679  compare_value = old_value;
680  int8_t selected_old_val = static_cast<int8_t>(
681  __byte_perm(compare_value, 0, (reinterpret_cast<size_t>(agg) & 0x3) | 0x4440));
682  auto min_value = static_cast<unsigned int>(
683  selected_old_val == skip_val ? val : min(val, selected_old_val));
684  swap_value = __byte_perm(
685  compare_value, min_value, byte_permutations[reinterpret_cast<size_t>(agg) & 0x3]);
686  old_value = atomicCAS(base_address_u32, compare_value, swap_value);
687  } while (compare_value != old_value);
688 }

__device__ double atomicMinDblSkipVal ( double *  address,
double  val,
const double  skip_val 
)

Definition at line 1097 of file cuda_mapd_rt.cu.

Referenced by agg_min_double_skip_val_shared().

1099  {
1100  unsigned long long int* address_as_ull =
1101  reinterpret_cast<unsigned long long int*>(address);
1102  unsigned long long int old = *address_as_ull;
1103  unsigned long long int skip_val_as_ull =
1104  *reinterpret_cast<const unsigned long long*>(&skip_val);
1105  unsigned long long int assumed;
1106 
1107  do {
1108  assumed = old;
1109  old = atomicCAS(address_as_ull,
1110  assumed,
1111  assumed == skip_val_as_ull
1112  ? *reinterpret_cast<unsigned long long*>(&val)
1113  : __double_as_longlong(min(val, __longlong_as_double(assumed))));
1114  } while (assumed != old);
1115 
1116  return __longlong_as_double(old);
1117 }

__device__ float atomicMinFltSkipVal ( int32_t *  address,
float  val,
const float  skip_val 
)

Definition at line 1049 of file cuda_mapd_rt.cu.

References atomicMin().

Referenced by agg_min_float_skip_val_shared().

1049  {
1050  float old = atomicExch(reinterpret_cast<float*>(address), FLT_MAX);
1051  return atomicMin(
1052  reinterpret_cast<float*>(address),
1053  __float_as_int(old) == __float_as_int(skip_val) ? val : fminf(old, val));
1054 }
__device__ double atomicMin(double *address, double val)

__device__ int32_t atomicSum32SkipVal ( int32_t *  address,
const int32_t  val,
const int32_t  skip_val 
)

Definition at line 929 of file cuda_mapd_rt.cu.

References atomicAdd().

Referenced by agg_sum_int32_skip_val_shared().

931  {
932  unsigned int* address_as_int = (unsigned int*)address;
933  int32_t old = atomicExch(address_as_int, 0);
934  int32_t old2 = atomicAdd(address_as_int, old == skip_val ? val : (val + old));
935  return old == skip_val ? old2 : (old2 + old);
936 }
__device__ double atomicAdd(double *address, double val)

__device__ int64_t atomicSum64SkipVal ( int64_t *  address,
const int64_t  val,
const int64_t  skip_val 
)

Definition at line 948 of file cuda_mapd_rt.cu.

References atomicAdd().

Referenced by agg_sum_skip_val_shared().

950  {
951  unsigned long long int* address_as_ull = (unsigned long long int*)address;
952  int64_t old = atomicExch(address_as_ull, 0);
953  int64_t old2 = atomicAdd(address_as_ull, old == skip_val ? val : (val + old));
954  return old == skip_val ? old2 : (old2 + old);
955 }
__device__ double atomicAdd(double *address, double val)

__device__ void atomicSumDblSkipVal ( double *  address,
const double  val,
const double  skip_val 
)

Definition at line 1079 of file cuda_mapd_rt.cu.

References atomicAdd().

Referenced by agg_sum_double_skip_val_shared().

1081  {
1082  unsigned long long int* address_as_ull = (unsigned long long int*)address;
1083  double old = __longlong_as_double(atomicExch(address_as_ull, __double_as_longlong(0.)));
1084  atomicAdd(
1085  address,
1086  __double_as_longlong(old) == __double_as_longlong(skip_val) ? val : (val + old));
1087 }
__device__ double atomicAdd(double *address, double val)

__device__ void atomicSumFltSkipVal ( float *  address,
const float  val,
const float  skip_val 
)

Definition at line 1064 of file cuda_mapd_rt.cu.

References atomicAdd().

Referenced by agg_sum_float_skip_val_shared().

1066  {
1067  float old = atomicExch(address, 0.f);
1068  atomicAdd(address, __float_as_int(old) == __float_as_int(skip_val) ? val : (val + old));
1069 }
__device__ double atomicAdd(double *address, double val)

__device__ int32_t checked_single_agg_id_double_shared ( int64_t *  agg,
const double  val,
const double  null_val 
)

Definition at line 761 of file cuda_mapd_rt.cu.

763  {
764  unsigned long long int* address_as_ull = reinterpret_cast<unsigned long long int*>(agg);
765  unsigned long long int old = *address_as_ull, assumed;
766 
767  if (val == null_val) {
768  return 0;
769  }
770 
771  do {
772  if (static_cast<int64_t>(old) != __double_as_longlong(null_val)) {
773  if (static_cast<int64_t>(old) != __double_as_longlong(val)) {
774  // see Execute::ERR_SINGLE_VALUE_FOUND_MULTIPLE_VALUES
775  return 15;
776  } else {
777  break;
778  }
779  }
780 
781  assumed = old;
782  old = atomicCAS(address_as_ull, assumed, __double_as_longlong(val));
783  } while (assumed != old);
784 
785  return 0;
786 }
__device__ int32_t checked_single_agg_id_double_shared_slow ( int64_t *  agg,
const double *  valp,
const double  null_val 
)

Definition at line 793 of file cuda_mapd_rt.cu.

795  {
796  unsigned long long int* address_as_ull = reinterpret_cast<unsigned long long int*>(agg);
797  unsigned long long int old = *address_as_ull, assumed;
798  double val = *valp;
799 
800  if (val == null_val) {
801  return 0;
802  }
803 
804  do {
805  if (static_cast<int64_t>(old) != __double_as_longlong(null_val)) {
806  if (static_cast<int64_t>(old) != __double_as_longlong(val)) {
807  // see Execute::ERR_SINGLE_VALUE_FOUND_MULTIPLE_VALUES
808  return 15;
809  } else {
810  break;
811  }
812  }
813 
814  assumed = old;
815  old = atomicCAS(address_as_ull, assumed, __double_as_longlong(val));
816  } while (assumed != old);
817 
818  return 0;
819 }
__device__ int32_t checked_single_agg_id_float_shared ( int32_t *  agg,
const float  val,
const float  null_val 
)

Definition at line 825 of file cuda_mapd_rt.cu.

827  {
828  int* address_as_ull = reinterpret_cast<int*>(agg);
829  int old = *address_as_ull, assumed;
830 
831  if (val == null_val) {
832  return 0;
833  }
834 
835  do {
836  if (old != __float_as_int(null_val)) {
837  if (old != __float_as_int(val)) {
838  // see Execute::ERR_SINGLE_VALUE_FOUND_MULTIPLE_VALUES
839  return 15;
840  } else {
841  break;
842  }
843  }
844 
845  assumed = old;
846  old = atomicCAS(address_as_ull, assumed, __float_as_int(val));
847  } while (assumed != old);
848 
849  return 0;
850 }
__device__ int32_t checked_single_agg_id_shared ( int64_t *  agg,
const int64_t  val,
const int64_t  null_val 
)

Definition at line 718 of file cuda_mapd_rt.cu.

720  {
721  unsigned long long int* address_as_ull = reinterpret_cast<unsigned long long int*>(agg);
722  unsigned long long int old = *address_as_ull, assumed;
723 
724  if (val == null_val) {
725  return 0;
726  }
727 
728  do {
729  if (static_cast<int64_t>(old) != null_val) {
730  if (static_cast<int64_t>(old) != val) {
731  // see Execute::ERR_SINGLE_VALUE_FOUND_MULTIPLE_VALUES
732  return 15;
733  } else {
734  break;
735  }
736  }
737 
738  assumed = old;
739  old = atomicCAS(address_as_ull, assumed, val);
740  } while (assumed != old);
741 
742  return 0;
743 }
__device__ bool dynamic_watchdog ( )

Definition at line 181 of file cuda_mapd_rt.cu.

Referenced by anonymous_namespace{ResultSetReduction.cpp}::check_watchdog(), check_watchdog_rt(), get_group_value_columnar_slot_with_watchdog(), get_group_value_columnar_with_watchdog(), and get_group_value_with_watchdog().

181  {
182  // check for dynamic watchdog, if triggered all threads return true
183  if (dw_cycle_budget == 0LL) {
184  return false; // Uninitialized watchdog can't check time
185  }
186  if (dw_abort == 1) {
187  return true; // Received host request to abort
188  }
189  uint32_t smid = get_smid();
190  if (smid >= 128) {
191  return false;
192  }
193  __shared__ volatile int64_t dw_block_cycle_start; // Thread block shared cycle start
194  __shared__ volatile bool
195  dw_should_terminate; // all threads within a block should return together if
196  // watchdog criteria is met
197 
198  // thread 0 either initializes or reads the initial clock cycle; the result is stored
199  // into shared memory. Since all threads within a block share the same SM, there's no
200  // point in using more threads here.
201  if (threadIdx.x == 0) {
202  dw_block_cycle_start = 0LL;
203  int64_t cycle_count = static_cast<int64_t>(clock64());
204  // Make sure the block hasn't switched SMs
205  if (smid == get_smid()) {
206  dw_block_cycle_start = static_cast<int64_t>(
207  atomicCAS(reinterpret_cast<unsigned long long*>(&dw_sm_cycle_start[smid]),
208  0ULL,
209  static_cast<unsigned long long>(cycle_count)));
210  }
211 
212  int64_t cycles = cycle_count - dw_block_cycle_start;
213  if ((smid == get_smid()) && (dw_block_cycle_start > 0LL) &&
214  (cycles > dw_cycle_budget)) {
215  // Check if we're out of time on this particular SM
216  dw_should_terminate = true;
217  } else {
218  dw_should_terminate = false;
219  }
220  }
221  __syncthreads();
222  return dw_should_terminate;
223 }
__device__ int64_t dw_sm_cycle_start[128]
__device__ int64_t dw_cycle_budget
__inline__ __device__ uint32_t get_smid(void)
__device__ int32_t dw_abort

__device__ void force_sync ( )

Definition at line 1327 of file cuda_mapd_rt.cu.

1327  {
1328  __threadfence_block();
1329 }
template<typename T = unsigned long long>
__device__ T get_empty_key ( )
inline

Definition at line 226 of file cuda_mapd_rt.cu.

References EMPTY_KEY_64.

226  {
227  return EMPTY_KEY_64;
228 }
#define EMPTY_KEY_64
template<>
__device__ unsigned int get_empty_key ( )
inline

Definition at line 231 of file cuda_mapd_rt.cu.

References EMPTY_KEY_32.

231  {
232  return EMPTY_KEY_32;
233 }
#define EMPTY_KEY_32
template<typename T >
__device__ int64_t* get_matching_group_value ( int64_t *  groups_buffer,
const uint32_t  h,
const T *  key,
const uint32_t  key_count,
const uint32_t  row_size_quad 
)
inline

Definition at line 236 of file cuda_mapd_rt.cu.

References align_to_int64(), and atomicAdd().

Referenced by get_group_value(), get_group_value_with_watchdog(), and get_matching_group_value().

240  {
241  const T empty_key = get_empty_key<T>();
242  uint32_t off = h * row_size_quad;
243  auto row_ptr = reinterpret_cast<T*>(groups_buffer + off);
244  {
245  const T old = atomicCAS(row_ptr, empty_key, *key);
246  if (empty_key == old && key_count > 1) {
247  for (size_t i = 1; i <= key_count - 1; ++i) {
248  atomicExch(row_ptr + i, key[i]);
249  }
250  }
251  }
252  if (key_count > 1) {
253  while (atomicAdd(row_ptr + key_count - 1, 0) == empty_key) {
254  // spin until the winning thread has finished writing the entire key and the init
255  // value
256  }
257  }
258  bool match = true;
259  for (uint32_t i = 0; i < key_count; ++i) {
260  if (row_ptr[i] != key[i]) {
261  match = false;
262  break;
263  }
264  }
265 
266  if (match) {
267  auto row_ptr_i8 = reinterpret_cast<int8_t*>(row_ptr + key_count);
268  return reinterpret_cast<int64_t*>(align_to_int64(row_ptr_i8));
269  }
270  return NULL;
271 }
__device__ double atomicAdd(double *address, double val)
FORCE_INLINE HOST DEVICE T align_to_int64(T addr)
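As a hedged sketch of how this primitive is driven (the real probing loops live in callers such as get_group_value(); the hash choice and linear probing below are assumptions), a caller hashes the key, then probes until a non-NULL payload pointer comes back:

// Simplified, hypothetical probing loop over the row-wise group-by buffer.
__device__ int64_t* probe_for_group(int64_t* groups_buffer,
                                    const uint32_t entry_count,
                                    const int64_t* key,
                                    const uint32_t key_count,
                                    const uint32_t row_size_quad) {
  const uint32_t h = MurmurHash1(key, key_count * sizeof(int64_t), 0) % entry_count;
  uint32_t probe = h;
  do {
    int64_t* payload = get_matching_group_value(
        groups_buffer,
        probe,
        reinterpret_cast<const unsigned long long*>(key),  // 8-byte key components
        key_count,
        row_size_quad);
    if (payload) {
      return payload;  // slot was empty and claimed, or already held a matching key
    }
    probe = (probe + 1) % entry_count;  // linear probing on collision
  } while (probe != h);
  return NULL;  // table is full
}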

__device__ int64_t* get_matching_group_value ( int64_t *  groups_buffer,
const uint32_t  h,
const int64_t *  key,
const uint32_t  key_count,
const uint32_t  key_width,
const uint32_t  row_size_quad,
const int64_t *  init_vals 
)

Definition at line 273 of file cuda_mapd_rt.cu.

References get_matching_group_value().

279  {
280  switch (key_width) {
281  case 4:
282  return get_matching_group_value(groups_buffer,
283  h,
284  reinterpret_cast<const unsigned int*>(key),
285  key_count,
286  row_size_quad);
287  case 8:
288  return get_matching_group_value(groups_buffer,
289  h,
290  reinterpret_cast<const unsigned long long*>(key),
291  key_count,
292  row_size_quad);
293  default:
294  return NULL;
295  }
296 }
__device__ int64_t * get_matching_group_value(int64_t *groups_buffer, const uint32_t h, const T *key, const uint32_t key_count, const uint32_t row_size_quad)

__device__ int64_t* get_matching_group_value_columnar ( int64_t *  groups_buffer,
const uint32_t  h,
const int64_t *  key,
const uint32_t  key_qw_count,
const size_t  entry_count 
)

Definition at line 359 of file cuda_mapd_rt.cu.

References EMPTY_KEY_64, and key_qw_count.

Referenced by get_group_value_columnar(), and get_group_value_columnar_with_watchdog().

364  {
365  uint32_t off = h;
366  {
367  const uint64_t old = atomicCAS(
368  reinterpret_cast<unsigned long long*>(groups_buffer + off), EMPTY_KEY_64, *key);
369  if (EMPTY_KEY_64 == old) {
370  for (size_t i = 0; i < key_qw_count; ++i) {
371  groups_buffer[off] = key[i];
372  off += entry_count;
373  }
374  return &groups_buffer[off];
375  }
376  }
377  __syncthreads();
378  off = h;
379  for (size_t i = 0; i < key_qw_count; ++i) {
380  if (groups_buffer[off] != key[i]) {
381  return NULL;
382  }
383  off += entry_count;
384  }
385  return &groups_buffer[off];
386 }
#define EMPTY_KEY_64
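The columnar layout walked by the loops above can be summarized with a small illustrative helper (an assumption, not part of this file): key component k of entry h lives at offset h + k * entry_count.

// Illustrative only: address of the k-th key component of entry h in the
// columnar group-by buffer.
__device__ inline int64_t* columnar_key_ptr(int64_t* groups_buffer,
                                            const uint32_t h,
                                            const uint32_t k,
                                            const size_t entry_count) {
  return groups_buffer + h + k * entry_count;
}
// The payload for entry h then starts at groups_buffer + h + key_qw_count * entry_count,
// which is exactly the pointer get_matching_group_value_columnar returns.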

template<typename T >
__device__ int32_t get_matching_group_value_columnar_slot ( int64_t *  groups_buffer,
const uint32_t  entry_count,
const uint32_t  h,
const T *  key,
const uint32_t  key_count 
)

Definition at line 299 of file cuda_mapd_rt.cu.

Referenced by get_group_value_columnar_slot(), get_group_value_columnar_slot_with_watchdog(), and get_matching_group_value_columnar_slot().

303  {
304  const T empty_key = get_empty_key<T>();
305  const uint64_t old =
306  atomicCAS(reinterpret_cast<T*>(groups_buffer + h), empty_key, *key);
307  // the winning thread proceeds with writing the rest of the keys
308  if (old == empty_key) {
309  uint32_t offset = h + entry_count;
310  for (size_t i = 1; i < key_count; ++i) {
311  *reinterpret_cast<T*>(groups_buffer + offset) = key[i];
312  offset += entry_count;
313  }
314  }
315 
316  __threadfence();
317  // for all threads except the winning thread, the memory content of the keys
318  // at the hash offset is checked again. In case of a complete match
319  // the hash offset is returned; otherwise -1 is returned
320  if (old != empty_key) {
321  uint32_t offset = h;
322  for (uint32_t i = 0; i < key_count; ++i) {
323  if (*reinterpret_cast<T*>(groups_buffer + offset) != key[i]) {
324  return -1;
325  }
326  offset += entry_count;
327  }
328  }
329  return h;
330 }

__device__ int32_t get_matching_group_value_columnar_slot ( int64_t *  groups_buffer,
const uint32_t  entry_count,
const uint32_t  h,
const int64_t *  key,
const uint32_t  key_count,
const uint32_t  key_width 
)

Definition at line 333 of file cuda_mapd_rt.cu.

References get_matching_group_value_columnar_slot().

338  {
339  switch (key_width) {
340  case 4:
341  return get_matching_group_value_columnar_slot(
342  groups_buffer,
343  entry_count,
344  h,
345  reinterpret_cast<const unsigned int*>(key),
346  key_count);
347  case 8:
348  return get_matching_group_value_columnar_slot(
349  groups_buffer,
350  entry_count,
351  h,
352  reinterpret_cast<const unsigned long long*>(key),
353  key_count);
354  default:
355  return -1;
356  }
357 }
__device__ int32_t get_matching_group_value_columnar_slot(int64_t *groups_buffer, const uint32_t entry_count, const uint32_t h, const T *key, const uint32_t key_count)

__inline__ __device__ uint32_t get_smid ( void  )

Definition at line 163 of file cuda_mapd_rt.cu.

Referenced by dynamic_watchdog().

163  {
164  uint32_t ret;
165  asm("mov.u32 %0, %%smid;" : "=r"(ret));
166  return ret;
167 }

__device__ int32_t group_buff_idx_impl ( )

Definition at line 15 of file cuda_mapd_rt.cu.

References pos_start_impl().

15  {
16  return pos_start_impl(NULL);
17 }
__device__ int32_t pos_start_impl(const int32_t *row_index_resume)
Definition: cuda_mapd_rt.cu:11

+ Here is the call graph for this function:

__device__ const int64_t* init_shared_mem ( const int64_t *  groups_buffer,
const int32_t  groups_buffer_size 
)

Definition at line 36 of file cuda_mapd_rt.cu.

37  {
38  extern __shared__ int64_t fast_bins[];
39  if (threadIdx.x == 0) {
40  memcpy(fast_bins, groups_buffer, groups_buffer_size);
41  }
42  __syncthreads();
43  return fast_bins;
44 }
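A hedged kernel skeleton (an assumption about usage, not code from this file) shows how init_shared_mem() and write_back() bracket a shared-memory aggregation. Note that groups_buffer_size is passed straight to memcpy, so here it is a byte count, and the kernel must be launched with at least that much dynamic shared memory.

// Hypothetical skeleton; the aggregation step in the middle is elided.
__global__ void shared_mem_groupby_skeleton(int64_t* groups_buffer,
                                            const int32_t groups_buffer_size) {
  int64_t* smem_buffer =
      const_cast<int64_t*>(init_shared_mem(groups_buffer, groups_buffer_size));
  // ... per-thread aggregation into smem_buffer would go here ...
  write_back(groups_buffer, smem_buffer, groups_buffer_size);
}
// Launch sketch: shared_mem_groupby_skeleton<<<grid, block, groups_buffer_size>>>(buf, size);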
__device__ const int64_t* init_shared_mem_dynamic ( const int64_t *  groups_buffer,
const int32_t  groups_buffer_size 
)

Initializes dynamic shared memory:

  1. Allocates dynamic shared memory.
  2. Sets every allocated element equal to the 'identity element' (zero by default).

Definition at line 77 of file cuda_mapd_rt.cu.

References alloc_shared_mem_dynamic(), and set_shared_mem_to_identity().

79  {
80  int64_t* groups_buffer_smem = alloc_shared_mem_dynamic();
81  set_shared_mem_to_identity(groups_buffer_smem, groups_buffer_size);
82  return groups_buffer_smem;
83 }
__device__ int64_t * alloc_shared_mem_dynamic()
Definition: cuda_mapd_rt.cu:51
__device__ void set_shared_mem_to_identity(int64_t *groups_buffer_smem, const int32_t groups_buffer_size, const int64_t identity_element=0)
Definition: cuda_mapd_rt.cu:61

__device__ const int64_t* init_shared_mem_nop ( const int64_t *  groups_buffer,
const int32_t  groups_buffer_size 
)

Definition at line 27 of file cuda_mapd_rt.cu.

References groups_buffer.

29  {
30  return groups_buffer;
31 }
__device__ void linear_probabilistic_count ( uint8_t *  bitmap,
const uint32_t  bitmap_bytes,
const uint8_t *  key_bytes,
const uint32_t  key_len 
)

Definition at line 1256 of file cuda_mapd_rt.cu.

1259  {
1260  const uint32_t bit_pos = MurmurHash1(key_bytes, key_len, 0) % (bitmap_bytes * 8);
1261  const uint32_t word_idx = bit_pos / 32;
1262  const uint32_t bit_idx = bit_pos % 32;
1263  atomicOr(((uint32_t*)bitmap) + word_idx, 1 << bit_idx);
1264 }
NEVER_INLINE DEVICE uint32_t MurmurHash1(const void *key, int len, const uint32_t seed)
Definition: MurmurHash.cpp:20
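The bitmap this routine fills can be turned into an approximate distinct count with the standard linear-counting estimator, n ≈ -m * ln(z/m), where m is the bitmap size in bits and z the number of bits still zero. The host-side sketch below (run after copying the bitmap back to the host) is an assumption about how such a reduction could look, not OmniSciDB's actual reduction code.

// Host-side sketch (assumption): estimate the number of distinct values.
#include <cmath>
#include <cstdint>
#include <vector>

double estimate_ndv_linear_counting(const std::vector<uint8_t>& bitmap) {
  const double m = static_cast<double>(bitmap.size()) * 8.0;  // bits in the bitmap
  double zero_bits = 0.0;
  for (const uint8_t byte : bitmap) {
    zero_bits += 8 - __builtin_popcount(byte);  // count bits still unset
  }
  if (zero_bits == 0.0) {
    return m;  // bitmap saturated; the estimator is no longer reliable
  }
  return -m * std::log(zero_bits / m);
}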
__device__ int32_t pos_start_impl ( const int32_t *  row_index_resume)

Definition at line 11 of file cuda_mapd_rt.cu.

Referenced by get_bin_from_k_heap_impl(), group_buff_idx_impl(), and record_error_code().

11  {
12  return blockIdx.x * blockDim.x + threadIdx.x;
13 }

__device__ int32_t pos_step_impl ( )

Definition at line 19 of file cuda_mapd_rt.cu.

Referenced by get_bin_from_k_heap_impl().

19  {
20  return blockDim.x * gridDim.x;
21 }
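Together, pos_start_impl() and pos_step_impl() encode the canonical CUDA grid-stride loop: the former is the global thread index, the latter the total number of threads in the grid. A minimal illustrative kernel (not from this file) looks like:

// Illustrative only: grid-stride loop over row_count rows.
__global__ void copy_column(const int64_t* in, int64_t* out, const int32_t row_count) {
  for (int32_t row = pos_start_impl(NULL); row < row_count; row += pos_step_impl()) {
    out[row] = in[row];
  }
}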

__device__ void set_shared_mem_to_identity ( int64_t *  groups_buffer_smem,
const int32_t  groups_buffer_size,
const int64_t  identity_element = 0 
)

Sets the allocated shared memory elements equal to 'identity_element'. groups_buffer_size: number of 64-bit elements in shared memory per thread block (i.e., it is expressed in units of 64-bit elements, not bytes).

Definition at line 61 of file cuda_mapd_rt.cu.

Referenced by init_shared_mem_dynamic().

64  {
65 #pragma unroll
66  for (int i = threadIdx.x; i < groups_buffer_size; i += blockDim.x) {
67  groups_buffer_smem[i] = identity_element;
68  }
69  __syncthreads();
70 }

__device__ bool slotEmptyKeyCAS ( int64_t *  slot,
int64_t  new_val,
int64_t  init_val 
)

Definition at line 1157 of file cuda_mapd_rt.cu.

1159  {
1160  auto slot_address = reinterpret_cast<unsigned long long int*>(slot);
1161  const auto empty_key =
1162  static_cast<unsigned long long int*>(static_cast<void*>(&init_val));
1163  const auto new_val_cast =
1164  static_cast<unsigned long long int*>(static_cast<void*>(&new_val));
1165 
1166  const auto old_val = atomicCAS(slot_address, *empty_key, *new_val_cast);
1167  if (old_val == *empty_key) {
1168  return true;
1169  } else {
1170  return false;
1171  }
1172 }
__device__ bool slotEmptyKeyCAS_int16 ( int16_t *  slot,
int16_t  new_val,
int16_t  init_val 
)

Definition at line 1185 of file cuda_mapd_rt.cu.

1187  {
1188  unsigned int* base_slot_address =
1189  reinterpret_cast<unsigned int*>(reinterpret_cast<size_t>(slot) & ~0x3);
1190  unsigned int old_value = *base_slot_address;
1191  unsigned int swap_value, compare_value;
1192  do {
1193  compare_value = old_value;
1194  // exit criteria: if init_val does not exist in the slot (some other thread has
1195  // succeeded)
1196  if (static_cast<unsigned int>(init_val) !=
1197  __byte_perm(
1198  compare_value, 0, (reinterpret_cast<size_t>(slot) & 0x2 ? 0x3244 : 0x4410))) {
1199  return false;
1200  }
1201  swap_value = __byte_perm(compare_value,
1202  static_cast<unsigned int>(new_val),
1203  (reinterpret_cast<size_t>(slot) & 0x2) ? 0x5410 : 0x3254);
1204  old_value = atomicCAS(base_slot_address, compare_value, swap_value);
1205  } while (compare_value != old_value);
1206  return true;
1207 }
__device__ bool slotEmptyKeyCAS_int32 ( int32_t *  slot,
int32_t  new_val,
int32_t  init_val 
)

Definition at line 1174 of file cuda_mapd_rt.cu.

1176  {
1177  unsigned int* slot_address = reinterpret_cast<unsigned int*>(slot);
1178  unsigned int compare_value = static_cast<unsigned int>(init_val);
1179  unsigned int swap_value = static_cast<unsigned int>(new_val);
1180 
1181  const unsigned int old_value = atomicCAS(slot_address, compare_value, swap_value);
1182  return old_value == compare_value;
1183 }
__device__ bool slotEmptyKeyCAS_int8 ( int8_t *  slot,
int8_t  new_val,
int8_t  init_val 
)

Definition at line 1209 of file cuda_mapd_rt.cu.

1211  {
1212  // properly align the slot address:
1213  unsigned int* base_slot_address =
1214  reinterpret_cast<unsigned int*>(reinterpret_cast<size_t>(slot) & ~0x3);
1215  constexpr unsigned int byte_permutations[] = {0x3214, 0x3240, 0x3410, 0x4210};
1216  unsigned int old_value = *base_slot_address;
1217  unsigned int swap_value, compare_value;
1218  do {
1219  compare_value = old_value;
1220  // exit criteria: if init_val does not exist in the slot (some other thread has
1221  // succeeded)
1222  if (static_cast<unsigned int>(init_val) !=
1223  __byte_perm(compare_value, 0, (reinterpret_cast<size_t>(slot) & 0x3) | 0x4440)) {
1224  return false;
1225  }
1226  swap_value = __byte_perm(compare_value,
1227  static_cast<unsigned int>(new_val),
1228  byte_permutations[reinterpret_cast<size_t>(slot) & 0x3]);
1229  old_value = atomicCAS(base_slot_address, compare_value, swap_value);
1230  } while (compare_value != old_value);
1231  return true;
1232 }
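As a hedged usage sketch (an assumption, not code from this file): because the CAS loop only swaps the targeted byte and retries when a neighbour changes the surrounding word, several threads can independently claim adjacent 8-bit slots packed into the same aligned 32-bit word.

// Illustrative only: threads 0..3 each claim a different byte of one 32-bit word.
__device__ void claim_byte_slots(int8_t* slots /* assumed 4-byte aligned */,
                                 const int8_t init_val) {
  if (threadIdx.x < 4) {
    slotEmptyKeyCAS_int8(slots + threadIdx.x,
                         static_cast<int8_t>(threadIdx.x + 1),  // new value for this byte
                         init_val);
  }
}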
__device__ uint64_t string_decode ( int8_t *  chunk_iter_,
int64_t  pos 
)

Definition at line 1245 of file cuda_mapd_rt.cu.

References ChunkIter_get_nth(), VarlenDatum::is_null, VarlenDatum::length, and VarlenDatum::pointer.

1245  {
1246  // TODO(alex): de-dup, the x64 version is basically identical
1247  ChunkIter* chunk_iter = reinterpret_cast<ChunkIter*>(chunk_iter_);
1248  VarlenDatum vd;
1249  bool is_end;
1250  ChunkIter_get_nth(chunk_iter, pos, false, &vd, &is_end);
1251  return vd.is_null ? 0
1252  : (reinterpret_cast<uint64_t>(vd.pointer) & 0xffffffffffff) |
1253  (static_cast<uint64_t>(vd.length) << 48);
1254 }
bool is_null
Definition: sqltypes.h:76
DEVICE void ChunkIter_get_nth(ChunkIter *it, int n, bool uncompress, VarlenDatum *result, bool *is_end)
Definition: ChunkIter.cpp:181
int8_t * pointer
Definition: sqltypes.h:75
size_t length
Definition: sqltypes.h:74
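The returned value packs the string pointer into the low 48 bits and the length into the high 16 bits. A hedged sketch of the inverse unpacking a caller would perform (illustrative only, not code from this file):

// Illustrative unpacking helper.
__device__ inline void unpack_string(const uint64_t packed,
                                     const int8_t** ptr,
                                     uint32_t* len) {
  *ptr = reinterpret_cast<const int8_t*>(packed & 0xffffffffffffULL);
  *len = static_cast<uint32_t>(packed >> 48);
}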

__device__ void sync_warp ( )

Definition at line 1331 of file cuda_mapd_rt.cu.

1331  {
1332 #if (CUDA_VERSION >= 9000)
1333  __syncwarp();
1334 #endif
1335 }
__device__ void sync_warp_protected ( int64_t  thread_pos,
int64_t  row_count 
)

Protected warp synchronization that makes sure either all or none of the threads within a warp go through the synchronization barrier. thread_pos: the current thread position, used for a memory access. row_count: maximum number of rows to be processed. The function performs a warp sync iff all 32 threads within that warp will process valid data; for example, with row_count = 100, thread positions 0-95 synchronize while positions 96-127, which fall in the warp containing the last (partially valid) group of rows, skip the barrier. NOTE: it currently assumes a warp size of 32.

Definition at line 1344 of file cuda_mapd_rt.cu.

1344  {
1345 #if (CUDA_VERSION >= 9000)
1346  // only syncing if NOT within the same warp as those threads experiencing the critical
1347  // edge
1348  if ((((row_count - 1) | 0x1F) - thread_pos) >= 32) {
1349  __syncwarp();
1350  }
1351 #endif
1352 }
__device__ int8_t thread_warp_idx ( const int8_t  warp_sz)

Definition at line 23 of file cuda_mapd_rt.cu.

23  {
24  return threadIdx.x % warp_sz;
25 }
__device__ void write_back ( int64_t *  dest,
int64_t *  src,
const int32_t  sz 
)

Definition at line 85 of file cuda_mapd_rt.cu.

85  {
86  __syncthreads();
87  if (threadIdx.x == 0) {
88  memcpy(dest, src, sz);
89  }
90 }
int64_t * src
__device__ void write_back_nop ( int64_t *  dest,
int64_t *  src,
const int32_t  sz 
)

Definition at line 33 of file cuda_mapd_rt.cu.

33  {
34 }
__device__ void write_back_smem_nop ( int64_t *  dest,
int64_t *  src,
const int32_t  sz 
)

Definition at line 92 of file cuda_mapd_rt.cu.

94  {}

Variable Documentation

__device__ int32_t dw_abort = 0

Definition at line 161 of file cuda_mapd_rt.cu.

Referenced by dynamic_watchdog(), dynamic_watchdog_init(), and Executor::interrupt().

__device__ int64_t dw_cycle_budget = 0

Definition at line 160 of file cuda_mapd_rt.cu.

Referenced by dynamic_watchdog(), and dynamic_watchdog_init().

__device__ int64_t dw_sm_cycle_start[128]

Definition at line 158 of file cuda_mapd_rt.cu.

Referenced by dynamic_watchdog().