OmniSciDB  cde582ebc3
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
GpuInitGroups.cu
Go to the documentation of this file.
1 #include <cuda.h>
3 
4 #include "BufferCompaction.h"
5 #include "GpuInitGroups.h"
6 #include "GpuRtConstants.h"
7 #include "Logger/Logger.h"
8 
9 #define checkCudaErrors(err) CHECK_EQ(err, cudaSuccess)
10 
11 template <typename T>
12 __device__ int8_t* init_columnar_buffer(T* buffer_ptr,
13  const T init_val,
14  const uint32_t entry_count,
15  const int32_t start,
16  const int32_t step) {
17  for (int32_t i = start; i < entry_count; i += step) {
18  buffer_ptr[i] = init_val;
19  }
20  return reinterpret_cast<int8_t*>(buffer_ptr + entry_count);
21 }
22 
23 extern "C" __device__ void init_columnar_group_by_buffer_gpu_impl(
24  int64_t* groups_buffer,
25  const int64_t* init_vals,
26  const uint32_t groups_buffer_entry_count,
27  const uint32_t key_count,
28  const uint32_t agg_col_count,
29  const int8_t* col_sizes,
30  const bool need_padding,
31  const bool keyless,
32  const int8_t key_size) {
33  const int32_t start = blockIdx.x * blockDim.x + threadIdx.x;
34  const int32_t step = blockDim.x * gridDim.x;
35 
36  int8_t* buffer_ptr = reinterpret_cast<int8_t*>(groups_buffer);
37  if (!keyless) {
38  for (uint32_t i = 0; i < key_count; ++i) {
39  switch (key_size) {
40  case 1:
41  buffer_ptr = init_columnar_buffer<int8_t>(
42  buffer_ptr, EMPTY_KEY_8, groups_buffer_entry_count, start, step);
43  break;
44  case 2:
45  buffer_ptr =
46  init_columnar_buffer<int16_t>(reinterpret_cast<int16_t*>(buffer_ptr),
48  groups_buffer_entry_count,
49  start,
50  step);
51  break;
52  case 4:
53  buffer_ptr =
54  init_columnar_buffer<int32_t>(reinterpret_cast<int32_t*>(buffer_ptr),
56  groups_buffer_entry_count,
57  start,
58  step);
59  break;
60  case 8:
61  buffer_ptr =
62  init_columnar_buffer<int64_t>(reinterpret_cast<int64_t*>(buffer_ptr),
64  groups_buffer_entry_count,
65  start,
66  step);
67  break;
68  default:
69  // FIXME(miyu): CUDA linker doesn't accept assertion on GPU yet right now.
70  break;
71  }
72  buffer_ptr = align_to_int64(buffer_ptr);
73  }
74  }
75  int32_t init_idx = 0;
76  for (int32_t i = 0; i < agg_col_count; ++i) {
77  if (need_padding) {
78  buffer_ptr = align_to_int64(buffer_ptr);
79  }
80  switch (col_sizes[i]) {
81  case 1:
82  buffer_ptr = init_columnar_buffer<int8_t>(
83  buffer_ptr, init_vals[init_idx++], groups_buffer_entry_count, start, step);
84  break;
85  case 2:
86  buffer_ptr = init_columnar_buffer<int16_t>(reinterpret_cast<int16_t*>(buffer_ptr),
87  init_vals[init_idx++],
88  groups_buffer_entry_count,
89  start,
90  step);
91  break;
92  case 4:
93  buffer_ptr = init_columnar_buffer<int32_t>(reinterpret_cast<int32_t*>(buffer_ptr),
94  init_vals[init_idx++],
95  groups_buffer_entry_count,
96  start,
97  step);
98  break;
99  case 8:
100  buffer_ptr = init_columnar_buffer<int64_t>(reinterpret_cast<int64_t*>(buffer_ptr),
101  init_vals[init_idx++],
102  groups_buffer_entry_count,
103  start,
104  step);
105  break;
106  case 0:
107  continue;
108  default:
109  // FIXME(miyu): CUDA linker doesn't accept assertion on GPU yet now.
110  break;
111  }
112  }
113  __syncthreads();
114 }
115 
116 template <typename K>
117 inline __device__ void fill_empty_device_key(K* keys_ptr,
118  const uint32_t key_count,
119  const K empty_key) {
120  for (uint32_t i = 0; i < key_count; ++i) {
121  keys_ptr[i] = empty_key;
122  }
123 }
124 
125 __global__ void init_group_by_buffer_gpu(int64_t* groups_buffer,
126  const int64_t* init_vals,
127  const uint32_t groups_buffer_entry_count,
128  const uint32_t key_count,
129  const uint32_t key_width,
130  const uint32_t row_size_quad,
131  const bool keyless,
132  const int8_t warp_size) {
133  const int32_t start = blockIdx.x * blockDim.x + threadIdx.x;
134  const int32_t step = blockDim.x * gridDim.x;
135  if (keyless) {
136  for (int32_t i = start;
137  i < groups_buffer_entry_count * row_size_quad * static_cast<int32_t>(warp_size);
138  i += step) {
139  groups_buffer[i] = init_vals[i % row_size_quad];
140  }
141  __syncthreads();
142  return;
143  }
144 
145  for (int32_t i = start; i < groups_buffer_entry_count; i += step) {
146  int64_t* keys_ptr = groups_buffer + i * row_size_quad;
147  switch (key_width) {
148  case 4:
150  reinterpret_cast<int32_t*>(keys_ptr), key_count, EMPTY_KEY_32);
151  break;
152  case 8:
154  reinterpret_cast<int64_t*>(keys_ptr), key_count, EMPTY_KEY_64);
155  break;
156  default:
157  break;
158  }
159  }
160 
161  const uint32_t values_off_quad =
162  align_to_int64(key_count * key_width) / sizeof(int64_t);
163  for (uint32_t i = start; i < groups_buffer_entry_count; i += step) {
164  int64_t* vals_ptr = groups_buffer + i * row_size_quad + values_off_quad;
165  const uint32_t val_count =
166  row_size_quad - values_off_quad; // value slots are always 64-bit
167  for (uint32_t j = 0; j < val_count; ++j) {
168  vals_ptr[j] = init_vals[j];
169  }
170  }
171  __syncthreads();
172 }
173 
175  int64_t* groups_buffer,
176  const int64_t* init_vals,
177  const uint32_t groups_buffer_entry_count,
178  const uint32_t key_count,
179  const uint32_t agg_col_count,
180  const int8_t* col_sizes,
181  const bool need_padding,
182  const bool keyless,
183  const int8_t key_size) {
185  init_vals,
186  groups_buffer_entry_count,
187  key_count,
188  agg_col_count,
189  col_sizes,
190  need_padding,
191  keyless,
192  key_size);
193 }
194 
195 void init_group_by_buffer_on_device(int64_t* groups_buffer,
196  const int64_t* init_vals,
197  const uint32_t groups_buffer_entry_count,
198  const uint32_t key_count,
199  const uint32_t key_width,
200  const uint32_t row_size_quad,
201  const bool keyless,
202  const int8_t warp_size,
203  const size_t block_size_x,
204  const size_t grid_size_x) {
205  auto qe_cuda_stream = getQueryEngineCudaStream();
206  init_group_by_buffer_gpu<<<grid_size_x, block_size_x, 0, qe_cuda_stream>>>(
207  groups_buffer,
208  init_vals,
209  groups_buffer_entry_count,
210  key_count,
211  key_width,
212  row_size_quad,
213  keyless,
214  warp_size);
215  checkCudaErrors(cudaStreamSynchronize(qe_cuda_stream));
216 }
217 
218 void init_columnar_group_by_buffer_on_device(int64_t* groups_buffer,
219  const int64_t* init_vals,
220  const uint32_t groups_buffer_entry_count,
221  const uint32_t key_count,
222  const uint32_t agg_col_count,
223  const int8_t* col_sizes,
224  const bool need_padding,
225  const bool keyless,
226  const int8_t key_size,
227  const size_t block_size_x,
228  const size_t grid_size_x) {
229  auto qe_cuda_stream = getQueryEngineCudaStream();
231  block_size_x,
232  0,
233  qe_cuda_stream>>>(groups_buffer,
234  init_vals,
235  groups_buffer_entry_count,
236  key_count,
237  agg_col_count,
238  col_sizes,
239  need_padding,
240  keyless,
241  key_size);
242  checkCudaErrors(cudaStreamSynchronize(qe_cuda_stream));
243 }
#define EMPTY_KEY_64
__global__ void init_columnar_group_by_buffer_gpu_wrapper(int64_t *groups_buffer, const int64_t *init_vals, const uint32_t groups_buffer_entry_count, const uint32_t key_count, const uint32_t agg_col_count, const int8_t *col_sizes, const bool need_padding, const bool keyless, const int8_t key_size)
__global__ void init_group_by_buffer_gpu(int64_t *groups_buffer, const int64_t *init_vals, const uint32_t groups_buffer_entry_count, const uint32_t key_count, const uint32_t key_width, const uint32_t row_size_quad, const bool keyless, const int8_t warp_size)
void * CUstream
Definition: nocuda.h:23
void init_columnar_group_by_buffer_on_device(int64_t *groups_buffer, const int64_t *init_vals, const uint32_t groups_buffer_entry_count, const uint32_t key_count, const uint32_t agg_col_count, const int8_t *col_sizes, const bool need_padding, const bool keyless, const int8_t key_size, const size_t block_size_x, const size_t grid_size_x)
__device__ int8_t * init_columnar_buffer(T *buffer_ptr, const T init_val, const uint32_t entry_count, const int32_t start, const int32_t step)
Macros and functions for groupby buffer compaction.
void init_group_by_buffer_on_device(int64_t *groups_buffer, const int64_t *init_vals, const uint32_t groups_buffer_entry_count, const uint32_t key_count, const uint32_t key_width, const uint32_t row_size_quad, const bool keyless, const int8_t warp_size, const size_t block_size_x, const size_t grid_size_x)
CUstream getQueryEngineCudaStream()
Definition: QueryEngine.cpp:3
__device__ void fill_empty_device_key(K *keys_ptr, const uint32_t key_count, const K empty_key)
#define EMPTY_KEY_8
__device__ void init_columnar_group_by_buffer_gpu_impl(int64_t *groups_buffer, const int64_t *init_vals, const uint32_t groups_buffer_entry_count, const uint32_t key_count, const uint32_t agg_col_count, const int8_t *col_sizes, const bool need_padding, const bool keyless, const int8_t key_size)
#define EMPTY_KEY_16
#define checkCudaErrors(err)
Definition: GpuInitGroups.cu:9
#define EMPTY_KEY_32
FORCE_INLINE HOST DEVICE T align_to_int64(T addr)