OmniSciDB  72c90bc290
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
ResultSetSortImpl.cu
Go to the documentation of this file.
1 #include <cuda.h>
3 
4 #include "BufferCompaction.h"
5 #include "GpuMemUtils.h"
6 #include "GpuRtConstants.h"
8 #include "ResultSetSortImpl.h"
9 #include "SortUtils.cuh"
10 
11 #include <thrust/copy.h>
12 #include <thrust/execution_policy.h>
13 #include <thrust/host_vector.h>
14 #include <thrust/sort.h>
15 
16 #define checkCudaErrors(err) CHECK_EQ(err, CUDA_SUCCESS)
17 
18 #define FORCE_CPU_VERSION
19 #include "BufferEntryUtils.h"
20 #undef FORCE_CPU_VERSION
21 
22 namespace {
23 
24 template <class K, class V, class I>
25 std::vector<uint32_t> do_radix_sort(const ExecutorDeviceType device_type,
26  const int device_id,
27  ThrustAllocator& thrust_allocator,
28  const int8_t* groupby_buffer,
29  V dev_oe_col_buffer_begin,
30  V dev_oe_col_buffer_end,
31  I dev_idx_buff_begin,
32  const size_t dev_idx_buff_size,
33  const PodOrderEntry& oe,
34  const GroupByBufferLayoutInfo& layout,
35  const size_t top_n) {
36  if (dev_idx_buff_size == 0) {
37  return {};
38  }
39  if (oe.is_desc) {
40  if (device_type == ExecutorDeviceType::GPU) {
41  auto qe_cuda_stream = getQueryEngineCudaStreamForDevice(device_id);
42  thrust::sort_by_key(thrust::cuda::par(thrust_allocator).on(qe_cuda_stream),
43  dev_oe_col_buffer_begin,
44  dev_oe_col_buffer_end,
45  dev_idx_buff_begin,
46  thrust::greater<int64_t>());
47  checkCudaErrors(cuStreamSynchronize(qe_cuda_stream));
48  } else {
49  thrust::sort_by_key(dev_oe_col_buffer_begin,
50  dev_oe_col_buffer_end,
51  dev_idx_buff_begin,
52  thrust::greater<int64_t>());
53  }
54  } else {
55  if (device_type == ExecutorDeviceType::GPU) {
56  auto qe_cuda_stream = getQueryEngineCudaStreamForDevice(device_id);
57  thrust::sort_by_key(thrust::cuda::par(thrust_allocator).on(qe_cuda_stream),
58  dev_oe_col_buffer_begin,
59  dev_oe_col_buffer_end,
60  dev_idx_buff_begin);
61  checkCudaErrors(cuStreamSynchronize(qe_cuda_stream));
62  } else {
63  thrust::sort_by_key(
64  dev_oe_col_buffer_begin, dev_oe_col_buffer_end, dev_idx_buff_begin);
65  }
66  }
67  // Speculatively transfer only the top_n first, most of the time it'll be enough.
68  thrust::host_vector<uint32_t> host_vector_result(
69  dev_idx_buff_begin, dev_idx_buff_begin + std::min(top_n, dev_idx_buff_size));
70  // Sometimes, radix sort can bring to the front entries which are empty.
71  // For example, ascending sort on COUNT(*) will bring non-existent groups
72  // to the front of dev_idx_buff since they're 0 in our system. Re-do the
73  // transfer in that case to bring the entire dev_idx_buff; existing logic
74  // in row iteration will take care of skipping the empty rows.
75  for (size_t i = 0; i < host_vector_result.size(); ++i) {
76  const auto entry_idx = host_vector_result[i];
77  if (is_empty_entry<K>(entry_idx, groupby_buffer, layout.row_bytes)) {
78  host_vector_result = thrust::host_vector<uint32_t>(
79  dev_idx_buff_begin, dev_idx_buff_begin + dev_idx_buff_size);
80  break;
81  }
82  }
83  std::vector<uint32_t> result;
84  result.reserve(std::min(top_n, host_vector_result.size()));
85  for (size_t i = 0; i < host_vector_result.size(); ++i) {
86  const auto entry_idx = host_vector_result[i];
87  if (!is_empty_entry<K>(entry_idx, groupby_buffer, layout.row_bytes)) {
88  result.push_back(entry_idx);
89  if (result.size() >= top_n) {
90  break;
91  }
92  }
93  }
94  return result;
95 }
96 
97 void add_nulls(std::vector<uint32_t>& idx_buff,
98  const std::vector<uint32_t>& null_idx_buff,
99  const PodOrderEntry& oe) {
100  if (null_idx_buff.empty()) {
101  return;
102  }
103  const auto insertion_point = oe.nulls_first ? idx_buff.begin() : idx_buff.end();
104  idx_buff.insert(insertion_point, null_idx_buff.begin(), null_idx_buff.end());
105 }
106 
107 template <typename T>
108 thrust::device_ptr<T> get_device_copy_ptr(const thrust::host_vector<T>& host_vec,
109  ThrustAllocator& thrust_allocator) {
110  if (host_vec.empty()) {
111  return thrust::device_ptr<T>(static_cast<T*>(nullptr));
112  }
113  const auto host_vec_bytes = host_vec.size() * sizeof(T);
114  T* dev_ptr = reinterpret_cast<T*>(
115  thrust_allocator.allocateScopedBuffer(align_to_int64(host_vec_bytes)));
116  copy_to_nvidia_gpu(thrust_allocator.getDataMgr(),
117  reinterpret_cast<CUdeviceptr>(dev_ptr),
118  &host_vec[0],
119  host_vec_bytes,
120  thrust_allocator.getDeviceId());
121  return thrust::device_ptr<T>(dev_ptr);
122 }
123 
124 template <class K>
125 std::vector<uint32_t> baseline_sort_fp(const ExecutorDeviceType device_type,
126  const int device_id,
127  Data_Namespace::DataMgr* data_mgr,
128  const int8_t* groupby_buffer,
129  const thrust::host_vector<int64_t>& oe_col_buffer,
130  const PodOrderEntry& oe,
131  const GroupByBufferLayoutInfo& layout,
132  const size_t top_n,
133  const size_t start,
134  const size_t step) {
135  thrust::host_vector<uint32_t> neg_idx_buff;
136  thrust::host_vector<uint32_t> pos_idx_buff;
137  std::vector<uint32_t> null_idx_buff;
138  thrust::host_vector<int64_t> neg_oe_col_buffer;
139  thrust::host_vector<int64_t> pos_oe_col_buffer;
140  const auto slice_entry_count =
141  layout.entry_count / step + (layout.entry_count % step ? 1 : 0);
142  neg_idx_buff.reserve(slice_entry_count);
143  pos_idx_buff.reserve(slice_entry_count);
144  null_idx_buff.reserve(slice_entry_count);
145  neg_oe_col_buffer.reserve(slice_entry_count);
146  pos_oe_col_buffer.reserve(slice_entry_count);
147  size_t oe_col_buffer_idx = 0;
148  const auto& oe_info = layout.oe_target_info;
149  const auto col_ti =
150  oe_info.agg_kind == kAVG ? SQLTypeInfo(kDOUBLE, false) : oe_info.sql_type;
151  // Execlude AVG b/c collect_order_entry_column already makes its pair collapse into a
152  // double
153  const bool float_argument_input =
154  takes_float_argument(oe_info) && oe_info.agg_kind != kAVG;
155 
156  auto is_negative =
157  float_argument_input ? [](const int64_t v) -> bool { return (v & (1 << 31)) != 0; }
158  : [](const int64_t v) -> bool { return v < 0; };
159 
160  for (size_t i = start; i < layout.entry_count; i += step, ++oe_col_buffer_idx) {
161  if (!is_empty_entry<K>(i, groupby_buffer, layout.row_bytes) &&
162  oe_col_buffer[oe_col_buffer_idx] ==
163  null_val_bit_pattern(col_ti, float_argument_input)) {
164  null_idx_buff.push_back(i);
165  continue;
166  }
167  if (is_negative(oe_col_buffer[oe_col_buffer_idx])) { // sign bit works the same for
168  // integer and floating point
169  neg_idx_buff.push_back(i);
170  neg_oe_col_buffer.push_back(oe_col_buffer[oe_col_buffer_idx]);
171  } else {
172  pos_idx_buff.push_back(i);
173  pos_oe_col_buffer.push_back(oe_col_buffer[oe_col_buffer_idx]);
174  }
175  }
176  std::vector<uint32_t> pos_result;
177  ThrustAllocator thrust_allocator(data_mgr, device_id);
178  if (device_type == ExecutorDeviceType::GPU) {
179  const auto dev_pos_idx_buff = get_device_copy_ptr(pos_idx_buff, thrust_allocator);
180  const auto dev_pos_oe_col_buffer =
181  get_device_copy_ptr(pos_oe_col_buffer, thrust_allocator);
182  pos_result = do_radix_sort<K>(device_type,
183  device_id,
184  thrust_allocator,
185  groupby_buffer,
186  dev_pos_oe_col_buffer,
187  dev_pos_oe_col_buffer + pos_oe_col_buffer.size(),
188  dev_pos_idx_buff,
189  pos_idx_buff.size(),
190  oe,
191  layout,
192  top_n);
193  } else {
194  CHECK(device_type == ExecutorDeviceType::CPU);
195  pos_result = do_radix_sort<K>(device_type,
196  device_id,
197  thrust_allocator,
198  groupby_buffer,
199  pos_oe_col_buffer.begin(),
200  pos_oe_col_buffer.end(),
201  pos_idx_buff.begin(),
202  pos_idx_buff.size(),
203  oe,
204  layout,
205  top_n);
206  }
207  std::vector<uint32_t> neg_result;
208  PodOrderEntry reverse_oe{oe.tle_no, !oe.is_desc, oe.nulls_first};
209  if (device_type == ExecutorDeviceType::GPU) {
210  const auto dev_neg_idx_buff = get_device_copy_ptr(neg_idx_buff, thrust_allocator);
211  const auto dev_neg_oe_col_buffer =
212  get_device_copy_ptr(neg_oe_col_buffer, thrust_allocator);
213  neg_result = do_radix_sort<K>(device_type,
214  device_id,
215  thrust_allocator,
216  groupby_buffer,
217  dev_neg_oe_col_buffer,
218  dev_neg_oe_col_buffer + neg_oe_col_buffer.size(),
219  dev_neg_idx_buff,
220  neg_idx_buff.size(),
221  reverse_oe,
222  layout,
223  top_n);
224  } else {
225  CHECK(device_type == ExecutorDeviceType::CPU);
226  neg_result = do_radix_sort<K>(device_type,
227  device_id,
228  thrust_allocator,
229  groupby_buffer,
230  neg_oe_col_buffer.begin(),
231  neg_oe_col_buffer.end(),
232  neg_idx_buff.begin(),
233  neg_idx_buff.size(),
234  reverse_oe,
235  layout,
236  top_n);
237  }
238  if (oe.is_desc) {
239  pos_result.insert(pos_result.end(), neg_result.begin(), neg_result.end());
240  add_nulls(pos_result, null_idx_buff, oe);
241  return pos_result;
242  }
243  neg_result.insert(neg_result.end(), pos_result.begin(), pos_result.end());
244  add_nulls(neg_result, null_idx_buff, oe);
245  return neg_result;
246 }
247 
248 template <class K>
249 std::vector<uint32_t> baseline_sort_int(const ExecutorDeviceType device_type,
250  const int device_id,
251  Data_Namespace::DataMgr* data_mgr,
252  const int8_t* groupby_buffer,
253  const thrust::host_vector<int64_t>& oe_col_buffer,
254  const PodOrderEntry& oe,
255  const GroupByBufferLayoutInfo& layout,
256  const size_t top_n,
257  const size_t start,
258  const size_t step) {
259  const auto& entry_ti = get_compact_type(layout.oe_target_info);
260  std::vector<uint32_t> null_idx_buff;
261  thrust::host_vector<uint32_t> notnull_idx_buff;
262  const auto slice_entry_count =
263  layout.entry_count / step + (layout.entry_count % step ? 1 : 0);
264  null_idx_buff.reserve(slice_entry_count);
265  notnull_idx_buff.reserve(slice_entry_count);
266  thrust::host_vector<int64_t> notnull_oe_col_buffer;
267  notnull_oe_col_buffer.reserve(slice_entry_count);
268  size_t oe_col_buffer_idx = 0;
269  for (size_t i = start; i < layout.entry_count; i += step, ++oe_col_buffer_idx) {
270  if (!is_empty_entry<K>(i, groupby_buffer, layout.row_bytes) &&
271  oe_col_buffer[oe_col_buffer_idx] == null_val_bit_pattern(entry_ti, false)) {
272  null_idx_buff.push_back(i);
273  } else {
274  notnull_idx_buff.push_back(i);
275  notnull_oe_col_buffer.push_back(oe_col_buffer[oe_col_buffer_idx]);
276  }
277  }
278  std::vector<uint32_t> notnull_result;
279  ThrustAllocator thrust_allocator(data_mgr, device_id);
280  if (device_type == ExecutorDeviceType::GPU) {
281  const auto dev_notnull_idx_buff =
282  get_device_copy_ptr(notnull_idx_buff, thrust_allocator);
283  const auto dev_notnull_oe_col_buffer =
284  get_device_copy_ptr(notnull_oe_col_buffer, thrust_allocator);
285  notnull_result =
286  do_radix_sort<K>(device_type,
287  device_id,
288  thrust_allocator,
289  groupby_buffer,
290  dev_notnull_oe_col_buffer,
291  dev_notnull_oe_col_buffer + notnull_oe_col_buffer.size(),
292  dev_notnull_idx_buff,
293  notnull_idx_buff.size(),
294  oe,
295  layout,
296  top_n);
297  } else {
298  CHECK(device_type == ExecutorDeviceType::CPU);
299  notnull_result = do_radix_sort<K>(device_type,
300  device_id,
301  thrust_allocator,
302  groupby_buffer,
303  notnull_oe_col_buffer.begin(),
304  notnull_oe_col_buffer.end(),
305  notnull_idx_buff.begin(),
306  notnull_idx_buff.size(),
307  oe,
308  layout,
309  top_n);
310  }
311  add_nulls(notnull_result, null_idx_buff, oe);
312  return notnull_result;
313 }
314 
315 template <class K>
316 thrust::host_vector<int64_t> collect_order_entry_column(
317  const int8_t* groupby_buffer,
318  const GroupByBufferLayoutInfo& layout,
319  const size_t start,
320  const size_t step) {
321  thrust::host_vector<int64_t> oe_col_buffer;
322  const auto row_ptr = groupby_buffer + start * layout.row_bytes;
323  auto crt_group_ptr1 = layout.target_groupby_index >= 0
324  ? row_ptr + layout.target_groupby_index * sizeof(K)
325  : row_ptr + layout.col_off;
326  const int8_t* crt_group_ptr2{nullptr};
327  if (layout.oe_target_info.agg_kind == kAVG) {
328  crt_group_ptr2 = crt_group_ptr1 + layout.col_bytes;
329  }
330  const auto entry_ti = get_compact_type(layout.oe_target_info);
331  const bool float_argument_input = takes_float_argument(layout.oe_target_info);
332  const auto step_bytes = layout.row_bytes * step;
333  const auto col_bytes = float_argument_input ? entry_ti.get_size() : layout.col_bytes;
334  for (size_t i = start; i < layout.entry_count; i += step) {
335  auto val1 = read_int_from_buff(crt_group_ptr1, col_bytes > 0 ? col_bytes : sizeof(K));
336  if (crt_group_ptr2) {
337  const auto val2 = read_int_from_buff(crt_group_ptr2, 8);
338  const auto avg_val = pair_to_double({val1, val2}, entry_ti, float_argument_input);
339  val1 = *reinterpret_cast<const int64_t*>(&avg_val);
340  }
341  oe_col_buffer.push_back(val1);
342  crt_group_ptr1 += step_bytes;
343  if (crt_group_ptr2) {
344  crt_group_ptr2 += step_bytes;
345  }
346  }
347  return oe_col_buffer;
348 }
349 
350 } // namespace
351 
352 template <class K>
353 std::vector<uint32_t> baseline_sort(const ExecutorDeviceType device_type,
354  const int device_id,
355  Data_Namespace::DataMgr* data_mgr,
356  const int8_t* groupby_buffer,
357  const PodOrderEntry& oe,
358  const GroupByBufferLayoutInfo& layout,
359  const size_t top_n,
360  const size_t start,
361  const size_t step) {
362  auto oe_col_buffer = collect_order_entry_column<K>(groupby_buffer, layout, start, step);
363  const auto& entry_ti = get_compact_type(layout.oe_target_info);
364  CHECK(entry_ti.is_number());
365  if (entry_ti.is_fp() || layout.oe_target_info.agg_kind == kAVG) {
366  return baseline_sort_fp<K>(device_type,
367  device_id,
368  data_mgr,
369  groupby_buffer,
370  oe_col_buffer,
371  oe,
372  layout,
373  top_n,
374  start,
375  step);
376  }
377  // Because of how we represent nulls for integral types, they'd be at the
378  // wrong position in these two cases. Separate them into a different buffer.
379  if ((oe.is_desc && oe.nulls_first) || (!oe.is_desc && !oe.nulls_first)) {
380  return baseline_sort_int<K>(device_type,
381  device_id,
382  data_mgr,
383  groupby_buffer,
384  oe_col_buffer,
385  oe,
386  layout,
387  top_n,
388  start,
389  step);
390  }
391  ThrustAllocator thrust_allocator(data_mgr, device_id);
392  // Fastest path, no need to separate nulls away since they'll end up at the
393  // right place as a side effect of how we're representing nulls.
394  if (device_type == ExecutorDeviceType::GPU) {
395  if (oe_col_buffer.empty()) {
396  return {};
397  }
398  const auto dev_idx_buff =
399  get_device_ptr<uint32_t>(oe_col_buffer.size(), thrust_allocator);
400  thrust::sequence(dev_idx_buff, dev_idx_buff + oe_col_buffer.size(), start, step);
401  const auto dev_oe_col_buffer = get_device_copy_ptr(oe_col_buffer, thrust_allocator);
402  return do_radix_sort<K>(device_type,
403  device_id,
404  thrust_allocator,
405  groupby_buffer,
406  dev_oe_col_buffer,
407  dev_oe_col_buffer + oe_col_buffer.size(),
408  dev_idx_buff,
409  oe_col_buffer.size(),
410  oe,
411  layout,
412  top_n);
413  }
414  CHECK(device_type == ExecutorDeviceType::CPU);
415  thrust::host_vector<uint32_t> host_idx_buff(oe_col_buffer.size());
416  thrust::sequence(host_idx_buff.begin(), host_idx_buff.end(), start, step);
417  return do_radix_sort<K>(device_type,
418  device_id,
419  thrust_allocator,
420  groupby_buffer,
421  oe_col_buffer.begin(),
422  oe_col_buffer.end(),
423  host_idx_buff.begin(),
424  host_idx_buff.size(),
425  oe,
426  layout,
427  top_n);
428 }
429 
430 template std::vector<uint32_t> baseline_sort<int32_t>(
431  const ExecutorDeviceType device_type,
432  const int device_id,
433  Data_Namespace::DataMgr* data_mgr,
434  const int8_t* groupby_buffer,
435  const PodOrderEntry& oe,
436  const GroupByBufferLayoutInfo& layout,
437  const size_t top_n,
438  const size_t start,
439  const size_t step);
440 
441 template std::vector<uint32_t> baseline_sort<int64_t>(
442  const ExecutorDeviceType device_type,
443  const int device_id,
444  Data_Namespace::DataMgr* data_mgr,
445  const int8_t* groupby_buffer,
446  const PodOrderEntry& oe,
447  const GroupByBufferLayoutInfo& layout,
448  const size_t top_n,
449  const size_t start,
450  const size_t step);
int getDeviceId() const
Utility functions for easy access to the result set buffers.
void * CUstream
Definition: nocuda.h:23
unsigned long long CUdeviceptr
Definition: nocuda.h:28
thrust::device_ptr< T > get_device_copy_ptr(const thrust::host_vector< T > &host_vec, ThrustAllocator &thrust_allocator)
Macros and functions for groupby buffer compaction.
Data_Namespace::DataMgr * getDataMgr() const
int64_t read_int_from_buff(const int8_t *ptr, const int8_t compact_sz)
double pair_to_double(const std::pair< int64_t, int64_t > &fp_pair, const SQLTypeInfo &ti, const bool float_argument_input)
bool takes_float_argument(const TargetInfo &target_info)
Definition: TargetInfo.h:106
int64_t null_val_bit_pattern(const SQLTypeInfo &ti, const bool float_argument_input)
ExecutorDeviceType
bool nulls_first
const SQLTypeInfo get_compact_type(const TargetInfo &target)
int8_t * allocateScopedBuffer(std::ptrdiff_t num_bytes)
std::vector< uint32_t > baseline_sort(const ExecutorDeviceType device_type, const int device_id, Data_Namespace::DataMgr *data_mgr, const int8_t *groupby_buffer, const PodOrderEntry &oe, const GroupByBufferLayoutInfo &layout, const size_t top_n, const size_t start, const size_t step)
Utility functions for group by buffer entries.
void collect_order_entry_column(thrust::device_ptr< K > &d_oe_col_buffer, const int8_t *d_src_buffer, const thrust::device_ptr< I > &d_idx_first, const size_t idx_count, const size_t oe_offset, const size_t oe_stride, ThrustAllocator &allocator, const int device_id)
Definition: TopKSort.cu:137
SQLAgg agg_kind
Definition: TargetInfo.h:51
void add_nulls(std::vector< uint32_t > &idx_buff, const std::vector< uint32_t > &null_idx_buff, const PodOrderEntry &oe)
int tle_no
void do_radix_sort(thrust::device_ptr< I > d_idx_first, const size_t idx_count, const int8_t *d_src_buffer, const PodOrderEntry &oe, const GroupByBufferLayoutInfo &layout, ThrustAllocator &allocator, const int device_id)
Definition: TopKSort.cu:180
CUstream getQueryEngineCudaStreamForDevice(int device_num)
Definition: QueryEngine.cpp:7
bool is_desc
const TargetInfo oe_target_info
void copy_to_nvidia_gpu(Data_Namespace::DataMgr *data_mgr, CUdeviceptr dst, const void *src, const size_t num_bytes, const int device_id)
Definition: GpuMemUtils.cpp:35
#define CHECK(condition)
Definition: Logger.h:291
#define checkCudaErrors(err)
Definition: GpuInitGroups.cu:9
std::vector< uint32_t > baseline_sort_fp(const ExecutorDeviceType device_type, const int device_id, Data_Namespace::DataMgr *data_mgr, const int8_t *groupby_buffer, const thrust::host_vector< int64_t > &oe_col_buffer, const PodOrderEntry &oe, const GroupByBufferLayoutInfo &layout, const size_t top_n, const size_t start, const size_t step)
template std::vector< uint32_t > baseline_sort< int32_t >(const ExecutorDeviceType device_type, const int device_id, Data_Namespace::DataMgr *data_mgr, const int8_t *groupby_buffer, const PodOrderEntry &oe, const GroupByBufferLayoutInfo &layout, const size_t top_n, const size_t start, const size_t step)
template std::vector< uint32_t > baseline_sort< int64_t >(const ExecutorDeviceType device_type, const int device_id, Data_Namespace::DataMgr *data_mgr, const int8_t *groupby_buffer, const PodOrderEntry &oe, const GroupByBufferLayoutInfo &layout, const size_t top_n, const size_t start, const size_t step)
Definition: sqldefs.h:74
FORCE_INLINE HOST DEVICE T align_to_int64(T addr)
const int64_t target_groupby_index
std::vector< uint32_t > baseline_sort_int(const ExecutorDeviceType device_type, const int device_id, Data_Namespace::DataMgr *data_mgr, const int8_t *groupby_buffer, const thrust::host_vector< int64_t > &oe_col_buffer, const PodOrderEntry &oe, const GroupByBufferLayoutInfo &layout, const size_t top_n, const size_t start, const size_t step)