template <typename T>
__device__ int8_t* init_columnar_buffer(T* buffer_ptr, const T init_val,
                                        const uint32_t entry_count,
                                        const int32_t start, const int32_t step) {
  // Grid-stride fill of one column with init_val.
  for (int32_t i = start; i < entry_count; i += step) {
    buffer_ptr[i] = init_val;
  }
  // Return the first byte past this column so the caller can continue with the next one.
  return reinterpret_cast<int8_t*>(buffer_ptr + entry_count);
}

__device__ void init_columnar_group_by_buffer_gpu_impl(
    int64_t* groups_buffer, const int64_t* init_vals,
    const uint32_t groups_buffer_entry_count, const uint32_t key_count,
    const uint32_t agg_col_count, const int8_t* col_sizes,
    const bool need_padding, const bool keyless, const int8_t key_size) {
  const int32_t start = blockIdx.x * blockDim.x + threadIdx.x;
  const int32_t step = blockDim.x * gridDim.x;

  int8_t* buffer_ptr = reinterpret_cast<int8_t*>(groups_buffer);
  if (!keyless) {
    // One column per key component; every entry gets the empty-key sentinel for its width.
    for (uint32_t i = 0; i < key_count; ++i) {
      switch (key_size) {
        case 1:
          buffer_ptr = init_columnar_buffer<int8_t>(
              buffer_ptr, EMPTY_KEY_8, groups_buffer_entry_count, start, step);
          break;
        case 2:
          buffer_ptr = init_columnar_buffer<int16_t>(reinterpret_cast<int16_t*>(buffer_ptr),
              EMPTY_KEY_16, groups_buffer_entry_count, start, step);
          break;
        case 4:
          buffer_ptr = init_columnar_buffer<int32_t>(reinterpret_cast<int32_t*>(buffer_ptr),
              EMPTY_KEY_32, groups_buffer_entry_count, start, step);
          break;
        case 8:
          buffer_ptr = init_columnar_buffer<int64_t>(reinterpret_cast<int64_t*>(buffer_ptr),
              EMPTY_KEY_64, groups_buffer_entry_count, start, step);
          break;
        default:
          break;
      }
      buffer_ptr = align_to_int64(buffer_ptr);
    }
  }

  // One column per aggregate target, initialized from init_vals.
  uint32_t init_idx = 0;
  for (uint32_t i = 0; i < agg_col_count; ++i) {
    if (need_padding) {
      buffer_ptr = align_to_int64(buffer_ptr);
    }
    switch (col_sizes[i]) {
      case 1:
        buffer_ptr = init_columnar_buffer<int8_t>(
            buffer_ptr, init_vals[init_idx++], groups_buffer_entry_count, start, step);
        break;
      case 2:
        buffer_ptr = init_columnar_buffer<int16_t>(reinterpret_cast<int16_t*>(buffer_ptr),
            init_vals[init_idx++], groups_buffer_entry_count, start, step);
        break;
      case 4:
        buffer_ptr = init_columnar_buffer<int32_t>(reinterpret_cast<int32_t*>(buffer_ptr),
            init_vals[init_idx++], groups_buffer_entry_count, start, step);
        break;
      case 8:
        buffer_ptr = init_columnar_buffer<int64_t>(reinterpret_cast<int64_t*>(buffer_ptr),
            init_vals[init_idx++], groups_buffer_entry_count, start, step);
        break;
      default:
        break;
    }
  }
}

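// Illustrative sketch (not part of the original file): one way the host side could size the
// columnar buffer that init_columnar_group_by_buffer_gpu_impl walks above. It assumes the same
// layout the kernel uses: key_count key columns of key_size bytes, then agg_col_count aggregate
// columns of col_sizes[i] bytes, each column holding entry_count values and rounded up to an
// 8-byte boundary. The helper name is hypothetical.
inline size_t columnar_groups_buffer_size_sketch(const uint32_t entry_count,
                                                 const uint32_t key_count,
                                                 const int8_t key_size,
                                                 const int8_t* col_sizes,
                                                 const uint32_t agg_col_count) {
  const auto align8 = [](size_t n) { return (n + 7) & ~static_cast<size_t>(7); };
  size_t bytes = 0;
  for (uint32_t i = 0; i < key_count; ++i) {
    bytes = align8(bytes + static_cast<size_t>(entry_count) * key_size);     // key column + padding
  }
  for (uint32_t i = 0; i < agg_col_count; ++i) {
    bytes = align8(bytes + static_cast<size_t>(entry_count) * col_sizes[i]); // agg column + padding
  }
  return bytes;
}
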
__device__ void init_render_buffer(int64_t* render_buffer, const uint32_t qw_count) {
  const uint32_t start = blockIdx.x * blockDim.x + threadIdx.x;
  const uint32_t step = blockDim.x * gridDim.x;
  for (uint32_t i = start; i < qw_count; i += step) {
    render_buffer[i] = EMPTY_KEY_64;
  }
}

__global__ void init_render_buffer_wrapper(int64_t* render_buffer, const uint32_t qw_count) {
  init_render_buffer(render_buffer, qw_count);
}

template <typename K>
inline __device__ void fill_empty_device_key(K* keys_ptr, const uint32_t key_count,
                                             const K empty_key) {
  for (uint32_t i = 0; i < key_count; ++i) {
    keys_ptr[i] = empty_key;
  }
}

__global__ void init_group_by_buffer_gpu(int64_t* groups_buffer, const int64_t* init_vals,
                                         const uint32_t groups_buffer_entry_count,
                                         const uint32_t key_count, const uint32_t key_width,
                                         const uint32_t row_size_quad, const bool keyless,
                                         const int8_t warp_size) {
  const int32_t start = blockIdx.x * blockDim.x + threadIdx.x;
  const int32_t step = blockDim.x * gridDim.x;

  if (keyless) {
    // Keyless layout: every slot is an aggregate value, replicated once per warp lane.
    for (int32_t i = start;
         i < groups_buffer_entry_count * row_size_quad * static_cast<int32_t>(warp_size);
         i += step) {
      groups_buffer[i] = init_vals[i % row_size_quad];
    }
    return;
  }

  // Row-wise layout: mark the key slots of every entry as empty.
  for (int32_t i = start; i < groups_buffer_entry_count; i += step) {
    int64_t* keys_ptr = groups_buffer + i * row_size_quad;
    switch (key_width) {
      case 4:
        fill_empty_device_key(reinterpret_cast<int32_t*>(keys_ptr), key_count, EMPTY_KEY_32);
        break;
      case 8:
        fill_empty_device_key(reinterpret_cast<int64_t*>(keys_ptr), key_count, EMPTY_KEY_64);
        break;
      default:
        break;
    }
  }

  // Then initialize the value slots that follow the (8-byte aligned) keys in each row.
  const uint32_t values_off_quad = align_to_int64(key_count * key_width) / sizeof(int64_t);
  for (int32_t i = start; i < groups_buffer_entry_count; i += step) {
    int64_t* vals_ptr = groups_buffer + i * row_size_quad + values_off_quad;
    const uint32_t val_count = row_size_quad - values_off_quad;
    for (uint32_t j = 0; j < val_count; ++j) {
      vals_ptr[j] = init_vals[j];
    }
  }
}

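// Worked example for the row-wise layout above (numbers are illustrative, not from this file):
// with key_count = 3 and key_width = 4, the keys occupy 12 bytes, which align_to_int64 rounds
// up to 16 bytes, so values_off_quad = 16 / sizeof(int64_t) = 2. With row_size_quad = 6, the
// first two int64 slots of each row hold the three 32-bit keys plus padding, the remaining four
// slots hold the values, and the kernel writes init_vals[0..3] into slots 2..5 of every row.
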
__global__ void init_columnar_group_by_buffer_gpu_wrapper(
    int64_t* groups_buffer, const int64_t* init_vals,
    const uint32_t groups_buffer_entry_count, const uint32_t key_count,
    const uint32_t agg_col_count, const int8_t* col_sizes,
    const bool need_padding, const bool keyless, const int8_t key_size) {
  init_columnar_group_by_buffer_gpu_impl(groups_buffer, init_vals, groups_buffer_entry_count,
                                         key_count, agg_col_count, col_sizes,
                                         need_padding, keyless, key_size);
}

void init_group_by_buffer_on_device(int64_t* groups_buffer, const int64_t* init_vals,
                                    const uint32_t groups_buffer_entry_count,
                                    const uint32_t key_count, const uint32_t key_width,
                                    const uint32_t row_size_quad, const bool keyless,
                                    const int8_t warp_size, const size_t block_size_x,
                                    const size_t grid_size_x) {
  init_group_by_buffer_gpu<<<grid_size_x, block_size_x>>>(groups_buffer, init_vals,
                                                          groups_buffer_entry_count, key_count,
                                                          key_width, row_size_quad, keyless,
                                                          warp_size);
}

void init_columnar_group_by_buffer_on_device(int64_t* groups_buffer, const int64_t* init_vals,
                                             const uint32_t groups_buffer_entry_count,
                                             const uint32_t key_count, const uint32_t agg_col_count,
                                             const int8_t* col_sizes, const bool need_padding,
                                             const bool keyless, const int8_t key_size,
                                             const size_t block_size_x, const size_t grid_size_x) {
  init_columnar_group_by_buffer_gpu_wrapper<<<grid_size_x, block_size_x>>>(
      groups_buffer, init_vals, groups_buffer_entry_count, key_count, agg_col_count,
      col_sizes, need_padding, keyless, key_size);
}

void init_render_buffer_on_device(int64_t* render_buffer, const uint32_t qw_count,
                                  const size_t block_size_x, const size_t grid_size_x) {
  init_render_buffer_wrapper<<<grid_size_x, block_size_x>>>(render_buffer, qw_count);
}

__global__ void init_columnar_group_by_buffer_gpu_wrapper(int64_t *groups_buffer, const int64_t *init_vals, const uint32_t groups_buffer_entry_count, const uint32_t key_count, const uint32_t agg_col_count, const int8_t *col_sizes, const bool need_padding, const bool keyless, const int8_t key_size)
__global__ void init_group_by_buffer_gpu(int64_t *groups_buffer, const int64_t *init_vals, const uint32_t groups_buffer_entry_count, const uint32_t key_count, const uint32_t key_width, const uint32_t row_size_quad, const bool keyless, const int8_t warp_size)
void init_columnar_group_by_buffer_on_device(int64_t *groups_buffer, const int64_t *init_vals, const uint32_t groups_buffer_entry_count, const uint32_t key_count, const uint32_t agg_col_count, const int8_t *col_sizes, const bool need_padding, const bool keyless, const int8_t key_size, const size_t block_size_x, const size_t grid_size_x)
__device__ int8_t * init_columnar_buffer(T *buffer_ptr, const T init_val, const uint32_t entry_count, const int32_t start, const int32_t step)
void init_group_by_buffer_on_device(int64_t *groups_buffer, const int64_t *init_vals, const uint32_t groups_buffer_entry_count, const uint32_t key_count, const uint32_t key_width, const uint32_t row_size_quad, const bool keyless, const int8_t warp_size, const size_t block_size_x, const size_t grid_size_x)
__device__ void fill_empty_device_key(K *keys_ptr, const uint32_t key_count, const K empty_key)
__global__ void init_render_buffer_wrapper(int64_t *render_buffer, const uint32_t qw_count)
__device__ void init_columnar_group_by_buffer_gpu_impl(int64_t *groups_buffer, const int64_t *init_vals, const uint32_t groups_buffer_entry_count, const uint32_t key_count, const uint32_t agg_col_count, const int8_t *col_sizes, const bool need_padding, const bool keyless, const int8_t key_size)
void init_render_buffer_on_device(int64_t *render_buffer, const uint32_t qw_count, const size_t block_size_x, const size_t grid_size_x)
__device__ void init_render_buffer(int64_t *render_buffer, const uint32_t qw_count)
FORCE_INLINE HOST DEVICE T align_to_int64(T addr)
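
// Illustrative sketch (not part of the original file): a minimal host-side driver for the
// init_group_by_buffer_on_device entry point listed above. The buffer sizes, init values,
// launch configuration, and the direct cudaMalloc/cudaMemcpy/cudaFree calls are assumptions
// made for the example; in the real engine they come from the query execution layer.
#include <cuda_runtime.h>

void example_init_row_wise_group_by_buffer() {
  const uint32_t entry_count = 1024;         // number of group-by slots
  const uint32_t key_count = 2;              // two 64-bit key components per entry
  const uint32_t key_width = 8;
  const uint32_t row_size_quad = 4;          // 2 key slots + 2 aggregate slots, in int64 units
  const int64_t init_vals_host[2] = {0, 0};  // one init value per aggregate slot

  int64_t* groups_buffer = nullptr;
  int64_t* init_vals = nullptr;
  cudaMalloc(reinterpret_cast<void**>(&groups_buffer),
             entry_count * row_size_quad * sizeof(int64_t));
  cudaMalloc(reinterpret_cast<void**>(&init_vals), sizeof(init_vals_host));
  cudaMemcpy(init_vals, init_vals_host, sizeof(init_vals_host), cudaMemcpyHostToDevice);

  // Fill every entry with empty keys and the provided aggregate init values.
  init_group_by_buffer_on_device(groups_buffer,
                                 init_vals,
                                 entry_count,
                                 key_count,
                                 key_width,
                                 row_size_quad,
                                 /*keyless=*/false,
                                 /*warp_size=*/1,
                                 /*block_size_x=*/256,
                                 /*grid_size_x=*/32);

  cudaFree(init_vals);
  cudaFree(groups_buffer);
}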