GpuInitGroups.cu
#include "BufferCompaction.h"
#include "GpuInitGroups.h"
#include "GpuRtConstants.h"

// Initialize one column of `entry_count` slots to `init_val`, striding over the
// entries with the caller-supplied grid stride, and return a pointer just past
// the end of the column.
template <typename T>
__device__ int8_t* init_columnar_buffer(T* buffer_ptr,
                                        const T init_val,
                                        const uint32_t entry_count,
                                        const int32_t start,
                                        const int32_t step) {
  for (int32_t i = start; i < entry_count; i += step) {
    buffer_ptr[i] = init_val;
  }
  return reinterpret_cast<int8_t*>(buffer_ptr + entry_count);
}
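// Commentary (added; not in the original source): init_columnar_group_by_buffer_gpu_impl
// below assumes a columnar layout: each key column occupies
// groups_buffer_entry_count slots of key_size bytes, padded up to an int64_t
// boundary, and is followed by one column per aggregate slot whose width comes
// from col_sizes. Key slots are set to the EMPTY_KEY_* sentinel of matching
// width; aggregate slots take the corresponding entry of init_vals.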
extern "C" __device__ void init_columnar_group_by_buffer_gpu_impl(
    int64_t* groups_buffer,
    const int64_t* init_vals,
    const uint32_t groups_buffer_entry_count,
    const uint32_t key_count,
    const uint32_t agg_col_count,
    const int8_t* col_sizes,
    const bool need_padding,
    const bool keyless,
    const int8_t key_size) {
  const int32_t start = blockIdx.x * blockDim.x + threadIdx.x;
  const int32_t step = blockDim.x * gridDim.x;

  int8_t* buffer_ptr = reinterpret_cast<int8_t*>(groups_buffer);
  if (!keyless) {
    for (uint32_t i = 0; i < key_count; ++i) {
      switch (key_size) {
        case 1:
          buffer_ptr = init_columnar_buffer<int8_t>(
              buffer_ptr, EMPTY_KEY_8, groups_buffer_entry_count, start, step);
          break;
        case 2:
          buffer_ptr =
              init_columnar_buffer<int16_t>(reinterpret_cast<int16_t*>(buffer_ptr),
                                            EMPTY_KEY_16,
                                            groups_buffer_entry_count,
                                            start,
                                            step);
          break;
        case 4:
          buffer_ptr =
              init_columnar_buffer<int32_t>(reinterpret_cast<int32_t*>(buffer_ptr),
                                            EMPTY_KEY_32,
                                            groups_buffer_entry_count,
                                            start,
                                            step);
          break;
        case 8:
          buffer_ptr =
              init_columnar_buffer<int64_t>(reinterpret_cast<int64_t*>(buffer_ptr),
                                            EMPTY_KEY_64,
                                            groups_buffer_entry_count,
                                            start,
                                            step);
          break;
        default:
          // FIXME(miyu): the CUDA linker doesn't accept assertions on GPU yet.
          break;
      }
      buffer_ptr = align_to_int64(buffer_ptr);
    }
  }
  int32_t init_idx = 0;
  for (int32_t i = 0; i < agg_col_count; ++i) {
    if (need_padding) {
      buffer_ptr = align_to_int64(buffer_ptr);
    }
    switch (col_sizes[i]) {
      case 1:
        buffer_ptr = init_columnar_buffer<int8_t>(
            buffer_ptr, init_vals[init_idx++], groups_buffer_entry_count, start, step);
        break;
      case 2:
        buffer_ptr = init_columnar_buffer<int16_t>(reinterpret_cast<int16_t*>(buffer_ptr),
                                                   init_vals[init_idx++],
                                                   groups_buffer_entry_count,
                                                   start,
                                                   step);
        break;
      case 4:
        buffer_ptr = init_columnar_buffer<int32_t>(reinterpret_cast<int32_t*>(buffer_ptr),
                                                   init_vals[init_idx++],
                                                   groups_buffer_entry_count,
                                                   start,
                                                   step);
        break;
      case 8:
        buffer_ptr = init_columnar_buffer<int64_t>(reinterpret_cast<int64_t*>(buffer_ptr),
                                                   init_vals[init_idx++],
                                                   groups_buffer_entry_count,
                                                   start,
                                                   step);
        break;
      case 0:
        continue;
      default:
        // FIXME(miyu): the CUDA linker doesn't accept assertions on GPU yet.
        break;
    }
  }
  __syncthreads();
}

__device__ void init_render_buffer(int64_t* render_buffer, const uint32_t qw_count) {
  const uint32_t start = blockIdx.x * blockDim.x + threadIdx.x;
  const uint32_t step = blockDim.x * gridDim.x;
  for (uint32_t i = start; i < qw_count; i += step) {
    render_buffer[i] = EMPTY_KEY_64;
  }
}

__global__ void init_render_buffer_wrapper(int64_t* render_buffer,
                                           const uint32_t qw_count) {
  init_render_buffer(render_buffer, qw_count);
}

template <typename K>
inline __device__ void fill_empty_device_key(K* keys_ptr,
                                             const uint32_t key_count,
                                             const K empty_key) {
  for (uint32_t i = 0; i < key_count; ++i) {
    keys_ptr[i] = empty_key;
  }
}
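// Commentary (added; not in the original source): init_group_by_buffer_gpu below
// initializes the row-wise layout, where each entry spans row_size_quad int64_t
// slots: the keys come first (key_count keys of key_width bytes, padded to an
// int64_t boundary), followed by 64-bit value slots. In the keyless case the
// buffer holds warp_size * groups_buffer_entry_count rows and every row is
// filled directly from init_vals.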
__global__ void init_group_by_buffer_gpu(int64_t* groups_buffer,
                                         const int64_t* init_vals,
                                         const uint32_t groups_buffer_entry_count,
                                         const uint32_t key_count,
                                         const uint32_t key_width,
                                         const uint32_t row_size_quad,
                                         const bool keyless,
                                         const int8_t warp_size) {
  const int32_t start = blockIdx.x * blockDim.x + threadIdx.x;
  const int32_t step = blockDim.x * gridDim.x;
  if (keyless) {
    for (int32_t i = start;
         i < groups_buffer_entry_count * row_size_quad * static_cast<int32_t>(warp_size);
         i += step) {
      groups_buffer[i] = init_vals[i % row_size_quad];
    }
    __syncthreads();
    return;
  }

  for (int32_t i = start; i < groups_buffer_entry_count; i += step) {
    int64_t* keys_ptr = groups_buffer + i * row_size_quad;
    switch (key_width) {
      case 4:
        fill_empty_device_key(
            reinterpret_cast<int32_t*>(keys_ptr), key_count, EMPTY_KEY_32);
        break;
      case 8:
        fill_empty_device_key(
            reinterpret_cast<int64_t*>(keys_ptr), key_count, EMPTY_KEY_64);
        break;
      default:
        break;
    }
  }

  const uint32_t values_off_quad =
      align_to_int64(key_count * key_width) / sizeof(int64_t);
  for (uint32_t i = start; i < groups_buffer_entry_count; i += step) {
    int64_t* vals_ptr = groups_buffer + i * row_size_quad + values_off_quad;
    const uint32_t val_count =
        row_size_quad - values_off_quad;  // value slots are always 64-bit
    for (uint32_t j = 0; j < val_count; ++j) {
      vals_ptr[j] = init_vals[j];
    }
  }
  __syncthreads();
}

__global__ void init_columnar_group_by_buffer_gpu_wrapper(
    int64_t* groups_buffer,
    const int64_t* init_vals,
    const uint32_t groups_buffer_entry_count,
    const uint32_t key_count,
    const uint32_t agg_col_count,
    const int8_t* col_sizes,
    const bool need_padding,
    const bool keyless,
    const int8_t key_size) {
  init_columnar_group_by_buffer_gpu_impl(groups_buffer,
                                         init_vals,
                                         groups_buffer_entry_count,
                                         key_count,
                                         agg_col_count,
                                         col_sizes,
                                         need_padding,
                                         keyless,
                                         key_size);
}

void init_group_by_buffer_on_device(int64_t* groups_buffer,
                                    const int64_t* init_vals,
                                    const uint32_t groups_buffer_entry_count,
                                    const uint32_t key_count,
                                    const uint32_t key_width,
                                    const uint32_t row_size_quad,
                                    const bool keyless,
                                    const int8_t warp_size,
                                    const size_t block_size_x,
                                    const size_t grid_size_x) {
  init_group_by_buffer_gpu<<<grid_size_x, block_size_x>>>(groups_buffer,
                                                          init_vals,
                                                          groups_buffer_entry_count,
                                                          key_count,
                                                          key_width,
                                                          row_size_quad,
                                                          keyless,
                                                          warp_size);
}

void init_columnar_group_by_buffer_on_device(int64_t* groups_buffer,
                                             const int64_t* init_vals,
                                             const uint32_t groups_buffer_entry_count,
                                             const uint32_t key_count,
                                             const uint32_t agg_col_count,
                                             const int8_t* col_sizes,
                                             const bool need_padding,
                                             const bool keyless,
                                             const int8_t key_size,
                                             const size_t block_size_x,
                                             const size_t grid_size_x) {
  init_columnar_group_by_buffer_gpu_wrapper<<<grid_size_x, block_size_x>>>(
      groups_buffer,
      init_vals,
      groups_buffer_entry_count,
      key_count,
      agg_col_count,
      col_sizes,
      need_padding,
      keyless,
      key_size);
}

void init_render_buffer_on_device(int64_t* render_buffer,
                                  const uint32_t qw_count,
                                  const size_t block_size_x,
                                  const size_t grid_size_x) {
  init_render_buffer_wrapper<<<grid_size_x, block_size_x>>>(render_buffer, qw_count);
}
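
The host-side entry points above simply forward to the kernels with caller-supplied launch dimensions. As a minimal sketch of how the row-wise initializer might be driven (not part of GpuInitGroups.cu; buffer sizes, initial values, and launch dimensions are illustrative assumptions, and the declaration is assumed to come from GpuInitGroups.h):

#include <cstdint>
#include <vector>
#include <cuda_runtime.h>
#include "GpuInitGroups.h"

void example_init_row_wise_buffer() {
  // Illustrative sizes: 1024 entries, two 64-bit keys and two 64-bit value slots.
  const uint32_t entry_count = 1024;
  const uint32_t key_count = 2;
  const uint32_t key_width = 8;
  const uint32_t row_size_quad = 4;  // row width in int64_t slots

  std::vector<int64_t> init_vals_host(row_size_quad, 0);  // hypothetical init values

  int64_t* groups_buffer{nullptr};
  int64_t* init_vals{nullptr};
  cudaMalloc(reinterpret_cast<void**>(&groups_buffer),
             entry_count * row_size_quad * sizeof(int64_t));
  cudaMalloc(reinterpret_cast<void**>(&init_vals),
             init_vals_host.size() * sizeof(int64_t));
  cudaMemcpy(init_vals,
             init_vals_host.data(),
             init_vals_host.size() * sizeof(int64_t),
             cudaMemcpyHostToDevice);

  // Launch dimensions are assumptions; the engine derives them from the device.
  init_group_by_buffer_on_device(groups_buffer,
                                 init_vals,
                                 entry_count,
                                 key_count,
                                 key_width,
                                 row_size_quad,
                                 /*keyless=*/false,
                                 /*warp_size=*/32,
                                 /*block_size_x=*/256,
                                 /*grid_size_x=*/64);
  cudaDeviceSynchronize();

  cudaFree(init_vals);
  cudaFree(groups_buffer);
}

The columnar variant, init_columnar_group_by_buffer_on_device, is driven the same way but takes the per-column widths in col_sizes instead of a fixed row width.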