OmniSciDB  eb3a3d0a03
GpuInitGroups.cu
#include "BufferCompaction.h"
#include "GpuInitGroups.h"
#include "GpuRtConstants.h"

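// Fills one column of entry_count slots with init_val using a grid-stride loop
// and returns a pointer just past the end of that column.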
template <typename T>
__device__ int8_t* init_columnar_buffer(T* buffer_ptr,
                                        const T init_val,
                                        const uint32_t entry_count,
                                        const int32_t start,
                                        const int32_t step) {
  for (int32_t i = start; i < entry_count; i += step) {
    buffer_ptr[i] = init_val;
  }
  return reinterpret_cast<int8_t*>(buffer_ptr + entry_count);
}

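// Initializes a columnar group-by buffer on the device: unless the layout is
// keyless, every key column is filled with the EMPTY_KEY sentinel of its width,
// then every aggregate column is filled with its slot from init_vals. Key
// columns are always padded up to an 8-byte boundary; aggregate columns are
// padded only when need_padding is set.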
extern "C" __device__ void init_columnar_group_by_buffer_gpu_impl(
    int64_t* groups_buffer,
    const int64_t* init_vals,
    const uint32_t groups_buffer_entry_count,
    const uint32_t key_count,
    const uint32_t agg_col_count,
    const int8_t* col_sizes,
    const bool need_padding,
    const bool keyless,
    const int8_t key_size) {
  const int32_t start = blockIdx.x * blockDim.x + threadIdx.x;
  const int32_t step = blockDim.x * gridDim.x;

  int8_t* buffer_ptr = reinterpret_cast<int8_t*>(groups_buffer);
  if (!keyless) {
    for (uint32_t i = 0; i < key_count; ++i) {
      switch (key_size) {
        case 1:
          buffer_ptr = init_columnar_buffer<int8_t>(
              buffer_ptr, EMPTY_KEY_8, groups_buffer_entry_count, start, step);
          break;
        case 2:
          buffer_ptr =
              init_columnar_buffer<int16_t>(reinterpret_cast<int16_t*>(buffer_ptr),
                                            EMPTY_KEY_16,
                                            groups_buffer_entry_count,
                                            start,
                                            step);
          break;
        case 4:
          buffer_ptr =
              init_columnar_buffer<int32_t>(reinterpret_cast<int32_t*>(buffer_ptr),
                                            EMPTY_KEY_32,
                                            groups_buffer_entry_count,
                                            start,
                                            step);
          break;
        case 8:
          buffer_ptr =
              init_columnar_buffer<int64_t>(reinterpret_cast<int64_t*>(buffer_ptr),
                                            EMPTY_KEY_64,
                                            groups_buffer_entry_count,
                                            start,
                                            step);
          break;
        default:
          // FIXME(miyu): CUDA linker doesn't accept assertions on GPU right now.
          break;
      }
      buffer_ptr = align_to_int64(buffer_ptr);
    }
  }
  int32_t init_idx = 0;
  for (int32_t i = 0; i < agg_col_count; ++i) {
    if (need_padding) {
      buffer_ptr = align_to_int64(buffer_ptr);
    }
    switch (col_sizes[i]) {
      case 1:
        buffer_ptr = init_columnar_buffer<int8_t>(
            buffer_ptr, init_vals[init_idx++], groups_buffer_entry_count, start, step);
        break;
      case 2:
        buffer_ptr = init_columnar_buffer<int16_t>(reinterpret_cast<int16_t*>(buffer_ptr),
                                                   init_vals[init_idx++],
                                                   groups_buffer_entry_count,
                                                   start,
                                                   step);
        break;
      case 4:
        buffer_ptr = init_columnar_buffer<int32_t>(reinterpret_cast<int32_t*>(buffer_ptr),
                                                   init_vals[init_idx++],
                                                   groups_buffer_entry_count,
                                                   start,
                                                   step);
        break;
      case 8:
        buffer_ptr = init_columnar_buffer<int64_t>(reinterpret_cast<int64_t*>(buffer_ptr),
                                                   init_vals[init_idx++],
                                                   groups_buffer_entry_count,
                                                   start,
                                                   step);
        break;
      case 0:
        continue;
      default:
        // FIXME(miyu): CUDA linker doesn't accept assertions on GPU right now.
        break;
    }
  }
  __syncthreads();
}
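
For reference, the byte layout that init_columnar_group_by_buffer_gpu_impl walks can be mirrored on the host to size the buffer. The helper below is a hypothetical sketch, not part of GpuInitGroups.cu: its name and the standalone pad_to_int64 (standing in for align_to_int64 from BufferCompaction.h) are assumptions, but the padding rules follow the kernel above.

// Hypothetical host-side sketch only -- mirrors the padding logic of
// init_columnar_group_by_buffer_gpu_impl to compute the expected buffer size.
#include <cstddef>
#include <cstdint>
#include <vector>

static inline size_t pad_to_int64(const size_t n) {
  return (n + 7) & ~static_cast<size_t>(7);  // round up to the next 8-byte boundary
}

size_t columnar_group_by_buffer_bytes(const uint32_t entry_count,
                                      const uint32_t key_count,
                                      const int8_t key_size,
                                      const std::vector<int8_t>& col_sizes,
                                      const bool need_padding,
                                      const bool keyless) {
  size_t bytes = 0;
  if (!keyless) {
    // One column per key, each padded to an 8-byte boundary.
    for (uint32_t i = 0; i < key_count; ++i) {
      bytes = pad_to_int64(bytes + static_cast<size_t>(key_size) * entry_count);
    }
  }
  for (const int8_t col_size : col_sizes) {
    if (need_padding) {
      bytes = pad_to_int64(bytes);
    }
    bytes += static_cast<size_t>(col_size) * entry_count;  // col_size == 0 adds nothing
  }
  return bytes;
}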

template <typename K>
inline __device__ void fill_empty_device_key(K* keys_ptr,
                                             const uint32_t key_count,
                                             const K empty_key) {
  for (uint32_t i = 0; i < key_count; ++i) {
    keys_ptr[i] = empty_key;
  }
}

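// Row-wise counterpart of the columnar initializer above. For keyless layouts
// every 64-bit slot of every row (and every warp copy) is set from init_vals;
// otherwise each row gets its keys set to the EMPTY_KEY sentinel followed by
// its 64-bit value slots set from init_vals.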
__global__ void init_group_by_buffer_gpu(int64_t* groups_buffer,
                                         const int64_t* init_vals,
                                         const uint32_t groups_buffer_entry_count,
                                         const uint32_t key_count,
                                         const uint32_t key_width,
                                         const uint32_t row_size_quad,
                                         const bool keyless,
                                         const int8_t warp_size) {
  const int32_t start = blockIdx.x * blockDim.x + threadIdx.x;
  const int32_t step = blockDim.x * gridDim.x;
  if (keyless) {
    for (int32_t i = start;
         i < groups_buffer_entry_count * row_size_quad * static_cast<int32_t>(warp_size);
         i += step) {
      groups_buffer[i] = init_vals[i % row_size_quad];
    }
    __syncthreads();
    return;
  }

  for (int32_t i = start; i < groups_buffer_entry_count; i += step) {
    int64_t* keys_ptr = groups_buffer + i * row_size_quad;
    switch (key_width) {
      case 4:
        fill_empty_device_key(
            reinterpret_cast<int32_t*>(keys_ptr), key_count, EMPTY_KEY_32);
        break;
      case 8:
        fill_empty_device_key(
            reinterpret_cast<int64_t*>(keys_ptr), key_count, EMPTY_KEY_64);
        break;
      default:
        break;
    }
  }

  const uint32_t values_off_quad =
      align_to_int64(key_count * key_width) / sizeof(int64_t);
  for (uint32_t i = start; i < groups_buffer_entry_count; i += step) {
    int64_t* vals_ptr = groups_buffer + i * row_size_quad + values_off_quad;
    const uint32_t val_count =
        row_size_quad - values_off_quad;  // value slots are always 64-bit
    for (uint32_t j = 0; j < val_count; ++j) {
      vals_ptr[j] = init_vals[j];
    }
  }
  __syncthreads();
}
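
The row-wise kernel above assumes each row consists of the keys, padded to the next 8-byte boundary, followed by 64-bit value slots; values_off_quad is that padded key size expressed in quadwords. A small hypothetical restatement of that arithmetic (the helper name is an assumption, not part of this file):

#include <cstdint>

// Hypothetical sketch only -- restates the values_off_quad computation above.
uint32_t row_values_offset_quad(const uint32_t key_count, const uint32_t key_width) {
  const uint32_t key_bytes = key_count * key_width;
  // Same result as align_to_int64(key_bytes) / sizeof(int64_t).
  return (key_bytes + 7) / 8;
}

// Example: 3 keys of 4 bytes occupy 12 bytes, padded to 16, so each row's
// 64-bit value slots start at quadword offset 2.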

__global__ void init_columnar_group_by_buffer_gpu_wrapper(
    int64_t* groups_buffer,
    const int64_t* init_vals,
    const uint32_t groups_buffer_entry_count,
    const uint32_t key_count,
    const uint32_t agg_col_count,
    const int8_t* col_sizes,
    const bool need_padding,
    const bool keyless,
    const int8_t key_size) {
  init_columnar_group_by_buffer_gpu_impl(groups_buffer,
                                         init_vals,
                                         groups_buffer_entry_count,
                                         key_count,
                                         agg_col_count,
                                         col_sizes,
                                         need_padding,
                                         keyless,
                                         key_size);
}

void init_group_by_buffer_on_device(int64_t* groups_buffer,
                                    const int64_t* init_vals,
                                    const uint32_t groups_buffer_entry_count,
                                    const uint32_t key_count,
                                    const uint32_t key_width,
                                    const uint32_t row_size_quad,
                                    const bool keyless,
                                    const int8_t warp_size,
                                    const size_t block_size_x,
                                    const size_t grid_size_x) {
  init_group_by_buffer_gpu<<<grid_size_x, block_size_x>>>(groups_buffer,
                                                          init_vals,
                                                          groups_buffer_entry_count,
                                                          key_count,
                                                          key_width,
                                                          row_size_quad,
                                                          keyless,
                                                          warp_size);
}

void init_columnar_group_by_buffer_on_device(int64_t* groups_buffer,
                                             const int64_t* init_vals,
                                             const uint32_t groups_buffer_entry_count,
                                             const uint32_t key_count,
                                             const uint32_t agg_col_count,
                                             const int8_t* col_sizes,
                                             const bool need_padding,
                                             const bool keyless,
                                             const int8_t key_size,
                                             const size_t block_size_x,
                                             const size_t grid_size_x) {
  init_columnar_group_by_buffer_gpu_wrapper<<<grid_size_x, block_size_x>>>(
      groups_buffer,
      init_vals,
      groups_buffer_entry_count,
      key_count,
      agg_col_count,
      col_sizes,
      need_padding,
      keyless,
      key_size);
}
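
A minimal host-side usage sketch, assuming a working CUDA context and the declarations from GpuInitGroups.h; the entry count, row shape, initialization values, and launch dimensions below are illustrative assumptions, not values taken from this file.

#include <cuda_runtime.h>

#include <cstdint>
#include <vector>

#include "GpuInitGroups.h"

void example_init_row_wise_group_by_buffer() {
  const uint32_t entry_count = 1024;
  const uint32_t key_count = 2;
  const uint32_t key_width = 8;      // two 64-bit keys -> 2 quadwords of keys
  const uint32_t row_size_quad = 4;  // 2 key quadwords + 2 64-bit value slots
  const std::vector<int64_t> init_vals{0, 0};  // one value per 64-bit value slot

  int64_t* dev_buffer{nullptr};
  int64_t* dev_init_vals{nullptr};
  cudaMalloc(&dev_buffer, entry_count * row_size_quad * sizeof(int64_t));
  cudaMalloc(&dev_init_vals, init_vals.size() * sizeof(int64_t));
  cudaMemcpy(dev_init_vals,
             init_vals.data(),
             init_vals.size() * sizeof(int64_t),
             cudaMemcpyHostToDevice);

  // Keys of every entry are set to EMPTY_KEY_64; the two value slots per row
  // are set from dev_init_vals.
  init_group_by_buffer_on_device(dev_buffer,
                                 dev_init_vals,
                                 entry_count,
                                 key_count,
                                 key_width,
                                 row_size_quad,
                                 /*keyless=*/false,
                                 /*warp_size=*/32,
                                 /*block_size_x=*/256,
                                 /*grid_size_x=*/16);
  cudaDeviceSynchronize();

  cudaFree(dev_init_vals);
  cudaFree(dev_buffer);
}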