ResultSetSortImpl.cu
#include "BufferCompaction.h"
#include "GpuMemUtils.h"
#include "GpuRtConstants.h"
#include "ResultSetBufferAccessors.h"
#include "ResultSetSortImpl.h"
#include "SortUtils.cuh"

#include <thrust/copy.h>
#include <thrust/execution_policy.h>
#include <thrust/host_vector.h>
#include <thrust/sort.h>

#define FORCE_CPU_VERSION
#include "BufferEntryUtils.h"
#undef FORCE_CPU_VERSION

namespace {

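// Sorts the index buffer by the order-entry column: keys live in
// [dev_oe_col_buffer_begin, dev_oe_col_buffer_end), the matching row indices in
// dev_idx_buff_begin, descending when oe.is_desc. On GPU the sort runs through the
// thrust::device execution policy backed by the ThrustAllocator; on CPU it runs on the
// host iterators directly. Only the first top_n indices are transferred back
// speculatively; if any of them turn out to be empty entries, the whole index buffer is
// transferred and the empty entries are filtered out.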
template <class K, class V, class I>
std::vector<uint32_t> do_radix_sort(const ExecutorDeviceType device_type,
                                    ThrustAllocator& thrust_allocator,
                                    const int8_t* groupby_buffer,
                                    V dev_oe_col_buffer_begin,
                                    V dev_oe_col_buffer_end,
                                    I dev_idx_buff_begin,
                                    const size_t dev_idx_buff_size,
                                    const PodOrderEntry& oe,
                                    const GroupByBufferLayoutInfo& layout,
                                    const size_t top_n) {
  if (dev_idx_buff_size == 0) {
    return {};
  }
  if (oe.is_desc) {
    if (device_type == ExecutorDeviceType::GPU) {
      thrust::sort_by_key(thrust::device(thrust_allocator),
                          dev_oe_col_buffer_begin,
                          dev_oe_col_buffer_end,
                          dev_idx_buff_begin,
                          thrust::greater<int64_t>());
    } else {
      thrust::sort_by_key(dev_oe_col_buffer_begin,
                          dev_oe_col_buffer_end,
                          dev_idx_buff_begin,
                          thrust::greater<int64_t>());
    }
  } else {
    if (device_type == ExecutorDeviceType::GPU) {
      thrust::sort_by_key(thrust::device(thrust_allocator),
                          dev_oe_col_buffer_begin,
                          dev_oe_col_buffer_end,
                          dev_idx_buff_begin);
    } else {
      thrust::sort_by_key(
          dev_oe_col_buffer_begin, dev_oe_col_buffer_end, dev_idx_buff_begin);
    }
  }
  // Speculatively transfer only the top_n first, most of the time it'll be enough.
  thrust::host_vector<uint32_t> host_vector_result(
      dev_idx_buff_begin, dev_idx_buff_begin + std::min(top_n, dev_idx_buff_size));
  // Sometimes, radix sort can bring to the front entries which are empty.
  // For example, ascending sort on COUNT(*) will bring non-existent groups
  // to the front of dev_idx_buff since they're 0 in our system. Re-do the
  // transfer in that case to bring the entire dev_idx_buff; existing logic
  // in row iteration will take care of skipping the empty rows.
  for (size_t i = 0; i < host_vector_result.size(); ++i) {
    const auto entry_idx = host_vector_result[i];
    if (is_empty_entry<K>(entry_idx, groupby_buffer, layout.row_bytes)) {
      host_vector_result = thrust::host_vector<uint32_t>(
          dev_idx_buff_begin, dev_idx_buff_begin + dev_idx_buff_size);
      break;
    }
  }
  std::vector<uint32_t> result;
  result.reserve(std::min(top_n, host_vector_result.size()));
  for (size_t i = 0; i < host_vector_result.size(); ++i) {
    const auto entry_idx = host_vector_result[i];
    if (!is_empty_entry<K>(entry_idx, groupby_buffer, layout.row_bytes)) {
      result.push_back(entry_idx);
      if (result.size() >= top_n) {
        break;
      }
    }
  }
  return result;
}

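// Splices the indices of the NULL entries back into the sorted result: at the front
// when NULLS FIRST was requested, at the back otherwise.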
void add_nulls(std::vector<uint32_t>& idx_buff,
               const std::vector<uint32_t>& null_idx_buff,
               const PodOrderEntry& oe) {
  if (null_idx_buff.empty()) {
    return;
  }
  const auto insertion_point = oe.nulls_first ? idx_buff.begin() : idx_buff.end();
  idx_buff.insert(insertion_point, null_idx_buff.begin(), null_idx_buff.end());
}

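// Copies a host vector into a scoped device buffer obtained from the ThrustAllocator
// (allocation size rounded up via align_to_int64) and returns a thrust::device_ptr to
// it; an empty host vector yields a null device pointer.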
template <typename T>
thrust::device_ptr<T> get_device_copy_ptr(const thrust::host_vector<T>& host_vec,
                                          ThrustAllocator& thrust_allocator) {
  if (host_vec.empty()) {
    return thrust::device_ptr<T>(static_cast<T*>(nullptr));
  }
  const auto host_vec_bytes = host_vec.size() * sizeof(T);
  T* dev_ptr = reinterpret_cast<T*>(
      thrust_allocator.allocateScopedBuffer(align_to_int64(host_vec_bytes)));
  copy_to_gpu(thrust_allocator.getDataMgr(),
              reinterpret_cast<CUdeviceptr>(dev_ptr),
              &host_vec[0],
              host_vec_bytes,
              thrust_allocator.getDeviceId());
  return thrust::device_ptr<T>(dev_ptr);
}

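// Top-N sort for floating point order-by columns. The column values arrive as raw
// IEEE-754 bit patterns stored in int64_t slots, so they cannot be sorted as plain
// integers across the sign boundary: entries are partitioned into NULL, negative and
// non-negative buckets, the two numeric buckets are sorted separately (the negative one
// with the direction reversed) and the pieces are concatenated according to the
// requested direction and NULLS FIRST/LAST.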
template <class K>
std::vector<uint32_t> baseline_sort_fp(const ExecutorDeviceType device_type,
                                       const int device_id,
                                       Data_Namespace::DataMgr* data_mgr,
                                       const int8_t* groupby_buffer,
                                       const thrust::host_vector<int64_t>& oe_col_buffer,
                                       const PodOrderEntry& oe,
                                       const GroupByBufferLayoutInfo& layout,
                                       const size_t top_n,
                                       const size_t start,
                                       const size_t step) {
  thrust::host_vector<uint32_t> neg_idx_buff;
  thrust::host_vector<uint32_t> pos_idx_buff;
  std::vector<uint32_t> null_idx_buff;
  thrust::host_vector<int64_t> neg_oe_col_buffer;
  thrust::host_vector<int64_t> pos_oe_col_buffer;
  const auto slice_entry_count =
      layout.entry_count / step + (layout.entry_count % step ? 1 : 0);
  neg_idx_buff.reserve(slice_entry_count);
  pos_idx_buff.reserve(slice_entry_count);
  null_idx_buff.reserve(slice_entry_count);
  neg_oe_col_buffer.reserve(slice_entry_count);
  pos_oe_col_buffer.reserve(slice_entry_count);
  size_t oe_col_buffer_idx = 0;
  const auto& oe_info = layout.oe_target_info;
  const auto col_ti =
      oe_info.agg_kind == kAVG ? SQLTypeInfo(kDOUBLE, false) : oe_info.sql_type;
  // Exclude AVG because collect_order_entry_column already collapses its value
  // pair into a double.
  const bool float_argument_input =
      takes_float_argument(oe_info) && oe_info.agg_kind != kAVG;

  auto is_negative =
      float_argument_input ? [](const int64_t v) -> bool { return (v & (1 << 31)) != 0; }
                           : [](const int64_t v) -> bool { return v < 0; };

  for (size_t i = start; i < layout.entry_count; i += step, ++oe_col_buffer_idx) {
    if (!is_empty_entry<K>(i, groupby_buffer, layout.row_bytes) &&
        oe_col_buffer[oe_col_buffer_idx] ==
            null_val_bit_pattern(col_ti, float_argument_input)) {
      null_idx_buff.push_back(i);
      continue;
    }
    if (is_negative(oe_col_buffer[oe_col_buffer_idx])) {  // sign bit works the same for
                                                          // integer and floating point
      neg_idx_buff.push_back(i);
      neg_oe_col_buffer.push_back(oe_col_buffer[oe_col_buffer_idx]);
    } else {
      pos_idx_buff.push_back(i);
      pos_oe_col_buffer.push_back(oe_col_buffer[oe_col_buffer_idx]);
    }
  }
  std::vector<uint32_t> pos_result;
  ThrustAllocator thrust_allocator(data_mgr, device_id);
  if (device_type == ExecutorDeviceType::GPU) {
    const auto dev_pos_idx_buff = get_device_copy_ptr(pos_idx_buff, thrust_allocator);
    const auto dev_pos_oe_col_buffer =
        get_device_copy_ptr(pos_oe_col_buffer, thrust_allocator);
    pos_result = do_radix_sort<K>(device_type,
                                  thrust_allocator,
                                  groupby_buffer,
                                  dev_pos_oe_col_buffer,
                                  dev_pos_oe_col_buffer + pos_oe_col_buffer.size(),
                                  dev_pos_idx_buff,
                                  pos_idx_buff.size(),
                                  oe,
                                  layout,
                                  top_n);
  } else {
    CHECK(device_type == ExecutorDeviceType::CPU);
    pos_result = do_radix_sort<K>(device_type,
                                  thrust_allocator,
                                  groupby_buffer,
                                  pos_oe_col_buffer.begin(),
                                  pos_oe_col_buffer.end(),
                                  pos_idx_buff.begin(),
                                  pos_idx_buff.size(),
                                  oe,
                                  layout,
                                  top_n);
  }
  std::vector<uint32_t> neg_result;
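  // Negative IEEE-754 values compare in reverse when their bit patterns are ordered as
  // integers: e.g. -1.0f is 0xBF800000 and -2.0f is 0xC0000000, so as signed integers
  // -2.0f's pattern is the larger one. Sorting the negative bucket with the direction
  // flipped restores the correct numeric order.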
  PodOrderEntry reverse_oe{oe.tle_no, !oe.is_desc, oe.nulls_first};
  if (device_type == ExecutorDeviceType::GPU) {
    const auto dev_neg_idx_buff = get_device_copy_ptr(neg_idx_buff, thrust_allocator);
    const auto dev_neg_oe_col_buffer =
        get_device_copy_ptr(neg_oe_col_buffer, thrust_allocator);
    neg_result = do_radix_sort<K>(device_type,
                                  thrust_allocator,
                                  groupby_buffer,
                                  dev_neg_oe_col_buffer,
                                  dev_neg_oe_col_buffer + neg_oe_col_buffer.size(),
                                  dev_neg_idx_buff,
                                  neg_idx_buff.size(),
                                  reverse_oe,
                                  layout,
                                  top_n);
  } else {
    CHECK(device_type == ExecutorDeviceType::CPU);
    neg_result = do_radix_sort<K>(device_type,
                                  thrust_allocator,
                                  groupby_buffer,
                                  neg_oe_col_buffer.begin(),
                                  neg_oe_col_buffer.end(),
                                  neg_idx_buff.begin(),
                                  neg_idx_buff.size(),
                                  reverse_oe,
                                  layout,
                                  top_n);
  }
  if (oe.is_desc) {
    pos_result.insert(pos_result.end(), neg_result.begin(), neg_result.end());
    add_nulls(pos_result, null_idx_buff, oe);
    return pos_result;
  }
  neg_result.insert(neg_result.end(), pos_result.begin(), pos_result.end());
  add_nulls(neg_result, null_idx_buff, oe);
  return neg_result;
}

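// Top-N sort for integral order-by columns with NULLs separated out: the NULL entries
// are collected into their own bucket before the sort and spliced back in afterwards
// via add_nulls.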
template <class K>
std::vector<uint32_t> baseline_sort_int(const ExecutorDeviceType device_type,
                                        const int device_id,
                                        Data_Namespace::DataMgr* data_mgr,
                                        const int8_t* groupby_buffer,
                                        const thrust::host_vector<int64_t>& oe_col_buffer,
                                        const PodOrderEntry& oe,
                                        const GroupByBufferLayoutInfo& layout,
                                        const size_t top_n,
                                        const size_t start,
                                        const size_t step) {
  const auto& entry_ti = get_compact_type(layout.oe_target_info);
  std::vector<uint32_t> null_idx_buff;
  thrust::host_vector<uint32_t> notnull_idx_buff;
  const auto slice_entry_count =
      layout.entry_count / step + (layout.entry_count % step ? 1 : 0);
  null_idx_buff.reserve(slice_entry_count);
  notnull_idx_buff.reserve(slice_entry_count);
  thrust::host_vector<int64_t> notnull_oe_col_buffer;
  notnull_oe_col_buffer.reserve(slice_entry_count);
  size_t oe_col_buffer_idx = 0;
  for (size_t i = start; i < layout.entry_count; i += step, ++oe_col_buffer_idx) {
    if (!is_empty_entry<K>(i, groupby_buffer, layout.row_bytes) &&
        oe_col_buffer[oe_col_buffer_idx] == null_val_bit_pattern(entry_ti, false)) {
      null_idx_buff.push_back(i);
    } else {
      notnull_idx_buff.push_back(i);
      notnull_oe_col_buffer.push_back(oe_col_buffer[oe_col_buffer_idx]);
    }
  }
  std::vector<uint32_t> notnull_result;
  ThrustAllocator thrust_allocator(data_mgr, device_id);
  if (device_type == ExecutorDeviceType::GPU) {
    const auto dev_notnull_idx_buff =
        get_device_copy_ptr(notnull_idx_buff, thrust_allocator);
    const auto dev_notnull_oe_col_buffer =
        get_device_copy_ptr(notnull_oe_col_buffer, thrust_allocator);
    notnull_result =
        do_radix_sort<K>(device_type,
                         thrust_allocator,
                         groupby_buffer,
                         dev_notnull_oe_col_buffer,
                         dev_notnull_oe_col_buffer + notnull_oe_col_buffer.size(),
                         dev_notnull_idx_buff,
                         notnull_idx_buff.size(),
                         oe,
                         layout,
                         top_n);
  } else {
    CHECK(device_type == ExecutorDeviceType::CPU);
    notnull_result = do_radix_sort<K>(device_type,
                                      thrust_allocator,
                                      groupby_buffer,
                                      notnull_oe_col_buffer.begin(),
                                      notnull_oe_col_buffer.end(),
                                      notnull_idx_buff.begin(),
                                      notnull_idx_buff.size(),
                                      oe,
                                      layout,
                                      top_n);
  }
  add_nulls(notnull_result, null_idx_buff, oe);
  return notnull_result;
}

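// Gathers the order-by column from the row-wise group-by buffer into a contiguous host
// vector, visiting every step-th entry starting at 'start'. For AVG the second slot of
// the aggregate pair is read as well and the pair is collapsed into a double via
// pair_to_double, so downstream code only has to sort a single value per entry.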
template <class K>
thrust::host_vector<int64_t> collect_order_entry_column(
    const int8_t* groupby_buffer,
    const GroupByBufferLayoutInfo& layout,
    const size_t start,
    const size_t step) {
  thrust::host_vector<int64_t> oe_col_buffer;
  const auto row_ptr = groupby_buffer + start * layout.row_bytes;
  auto crt_group_ptr1 = layout.target_groupby_index >= 0
                            ? row_ptr + layout.target_groupby_index * sizeof(K)
                            : row_ptr + layout.col_off;
  const int8_t* crt_group_ptr2{nullptr};
  if (layout.oe_target_info.agg_kind == kAVG) {
    crt_group_ptr2 = crt_group_ptr1 + layout.col_bytes;
  }
  const auto entry_ti = get_compact_type(layout.oe_target_info);
  const bool float_argument_input = takes_float_argument(layout.oe_target_info);
  const auto step_bytes = layout.row_bytes * step;
  for (size_t i = start; i < layout.entry_count; i += step) {
    auto val1 = read_int_from_buff(crt_group_ptr1,
                                   layout.col_bytes > 0 ? layout.col_bytes : sizeof(K));
    if (crt_group_ptr2) {
      const auto val2 = read_int_from_buff(crt_group_ptr2, 8);
      const auto avg_val = pair_to_double({val1, val2}, entry_ti, float_argument_input);
      val1 = *reinterpret_cast<const int64_t*>(&avg_val);
    }
    oe_col_buffer.push_back(val1);
    crt_group_ptr1 += step_bytes;
    if (crt_group_ptr2) {
      crt_group_ptr2 += step_bytes;
    }
  }
  return oe_col_buffer;
}

}  // namespace

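// Entry point for sorting baseline (row-wise) group-by buffers. Collects the order-by
// column, then dispatches: floating point and AVG go through the split-by-sign path,
// integral types whose NULL sentinel would sort to the wrong end go through the
// NULL-separating path, and the remaining integral cases are sorted directly on the
// device or host buffer.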
template <class K>
std::vector<uint32_t> baseline_sort(const ExecutorDeviceType device_type,
                                    const int device_id,
                                    Data_Namespace::DataMgr* data_mgr,
                                    const int8_t* groupby_buffer,
                                    const PodOrderEntry& oe,
                                    const GroupByBufferLayoutInfo& layout,
                                    const size_t top_n,
                                    const size_t start,
                                    const size_t step) {
  auto oe_col_buffer = collect_order_entry_column<K>(groupby_buffer, layout, start, step);
  const auto& entry_ti = get_compact_type(layout.oe_target_info);
  CHECK(entry_ti.is_number());
  if (entry_ti.is_fp() || layout.oe_target_info.agg_kind == kAVG) {
    return baseline_sort_fp<K>(device_type,
                               device_id,
                               data_mgr,
                               groupby_buffer,
                               oe_col_buffer,
                               oe,
                               layout,
                               top_n,
                               start,
                               step);
  }
  // Because of how we represent nulls for integral types, they'd be at the
  // wrong position in these two cases. Separate them into a different buffer.
  if ((oe.is_desc && oe.nulls_first) || (!oe.is_desc && !oe.nulls_first)) {
    return baseline_sort_int<K>(device_type,
                                device_id,
                                data_mgr,
                                groupby_buffer,
                                oe_col_buffer,
                                oe,
                                layout,
                                top_n,
                                start,
                                step);
  }
  ThrustAllocator thrust_allocator(data_mgr, device_id);
  // Fastest path, no need to separate nulls away since they'll end up at the
  // right place as a side effect of how we're representing nulls.
  if (device_type == ExecutorDeviceType::GPU) {
    if (oe_col_buffer.empty()) {
      return {};
    }
    const auto dev_idx_buff =
        get_device_ptr<uint32_t>(oe_col_buffer.size(), thrust_allocator);
    thrust::sequence(dev_idx_buff, dev_idx_buff + oe_col_buffer.size(), start, step);
    const auto dev_oe_col_buffer = get_device_copy_ptr(oe_col_buffer, thrust_allocator);
    return do_radix_sort<K>(device_type,
                            thrust_allocator,
                            groupby_buffer,
                            dev_oe_col_buffer,
                            dev_oe_col_buffer + oe_col_buffer.size(),
                            dev_idx_buff,
                            oe_col_buffer.size(),
                            oe,
                            layout,
                            top_n);
  }
  CHECK(device_type == ExecutorDeviceType::CPU);
  thrust::host_vector<uint32_t> host_idx_buff(oe_col_buffer.size());
  thrust::sequence(host_idx_buff.begin(), host_idx_buff.end(), start, step);
  return do_radix_sort<K>(device_type,
                          thrust_allocator,
                          groupby_buffer,
                          oe_col_buffer.begin(),
                          oe_col_buffer.end(),
                          host_idx_buff.begin(),
                          host_idx_buff.size(),
                          oe,
                          layout,
                          top_n);
}

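// Explicit instantiations for the two key widths used by the baseline group-by buffers.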
template std::vector<uint32_t> baseline_sort<int32_t>(
    const ExecutorDeviceType device_type,
    const int device_id,
    Data_Namespace::DataMgr* data_mgr,
    const int8_t* groupby_buffer,
    const PodOrderEntry& oe,
    const GroupByBufferLayoutInfo& layout,
    const size_t top_n,
    const size_t start,
    const size_t step);

template std::vector<uint32_t> baseline_sort<int64_t>(
    const ExecutorDeviceType device_type,
    const int device_id,
    Data_Namespace::DataMgr* data_mgr,
    const int8_t* groupby_buffer,
    const PodOrderEntry& oe,
    const GroupByBufferLayoutInfo& layout,
    const size_t top_n,
    const size_t start,
    const size_t step);
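
// A rough usage sketch (hypothetical, for illustration only): how a caller might drive
// the entry point for a CPU top-N sort over an existing row-wise group-by buffer. In
// OmniSciDB the layout, buffer and DataMgr come from the query execution layer; none of
// the setup shown here is part of this file.
//
//   Data_Namespace::DataMgr* data_mgr = ...;   // provided by the executor
//   const int8_t* groupby_buffer = ...;        // row-wise baseline group-by buffer
//   GroupByBufferLayoutInfo layout = ...;      // describes that buffer's layout
//   PodOrderEntry oe{/*tle_no=*/1, /*is_desc=*/true, /*nulls_first=*/false};
//   // start = 0, step = 1 visits every entry; start/step stride over the buffer when
//   // the work is partitioned.
//   const auto top_indices = baseline_sort<int64_t>(ExecutorDeviceType::CPU,
//                                                   /*device_id=*/0,
//                                                   data_mgr,
//                                                   groupby_buffer,
//                                                   oe,
//                                                   layout,
//                                                   /*top_n=*/10,
//                                                   /*start=*/0,
//                                                   /*step=*/1);
//   // top_indices holds row indices into groupby_buffer in the requested order, with
//   // empty entries skipped and nulls placed according to nulls_first.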