6 #include "SortUtils.cuh"
8 #include <thrust/copy.h>
9 #include <thrust/execution_policy.h>
10 #include <thrust/host_vector.h>
11 #include <thrust/sort.h>
13 #define FORCE_CPU_VERSION
15 #undef FORCE_CPU_VERSION
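// Sorts the order-by column (the keys) together with the matching row indices
// (the values) and returns up to top_n non-empty entries. The iterator types V
// and I may be host or device iterators; on the GPU path the sort runs through
// the ThrustAllocator-backed device execution policy.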
template <class K, class V, class I>
std::vector<uint32_t> do_radix_sort(const ExecutorDeviceType device_type,
                                    ThrustAllocator& thrust_allocator,
                                    const int8_t* groupby_buffer,
                                    V dev_oe_col_buffer_begin,
                                    V dev_oe_col_buffer_end,
                                    I dev_idx_buff_begin,
                                    const size_t dev_idx_buff_size,
                                    const PodOrderEntry& oe,
                                    const GroupByBufferLayoutInfo& layout,
                                    const size_t top_n) {
  if (dev_idx_buff_size == 0) {
    return {};
  }
  if (oe.is_desc) {
    if (device_type == ExecutorDeviceType::GPU) {
      thrust::sort_by_key(thrust::device(thrust_allocator),
                          dev_oe_col_buffer_begin,
                          dev_oe_col_buffer_end,
                          dev_idx_buff_begin,
                          thrust::greater<int64_t>());
    } else {
      thrust::sort_by_key(dev_oe_col_buffer_begin,
                          dev_oe_col_buffer_end,
                          dev_idx_buff_begin,
                          thrust::greater<int64_t>());
    }
  } else {
    if (device_type == ExecutorDeviceType::GPU) {
      thrust::sort_by_key(thrust::device(thrust_allocator),
                          dev_oe_col_buffer_begin,
                          dev_oe_col_buffer_end,
                          dev_idx_buff_begin);
    } else {
      thrust::sort_by_key(
          dev_oe_col_buffer_begin, dev_oe_col_buffer_end, dev_idx_buff_begin);
    }
  }
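  // Speculatively transfer only the first top_n entries back to the host; most
  // of the time that is all we need.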
  thrust::host_vector<uint32_t> host_vector_result(
      dev_idx_buff_begin, dev_idx_buff_begin + std::min(top_n, dev_idx_buff_size));
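  // The sort can bring empty entries to the front: an ascending sort on
  // COUNT(*), for example, puts non-existent groups (stored as zeros) first.
  // If that happened, re-transfer the entire index buffer; there is no way to
  // know in a single pass.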
  for (size_t i = 0; i < host_vector_result.size(); ++i) {
    const auto entry_idx = host_vector_result[i];
    if (is_empty_entry<K>(entry_idx, groupby_buffer, layout.row_bytes)) {
      host_vector_result = thrust::host_vector<uint32_t>(
          dev_idx_buff_begin, dev_idx_buff_begin + dev_idx_buff_size);
      break;
    }
  }
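  // Keep the first top_n non-empty entries.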
  std::vector<uint32_t> result;
  result.reserve(std::min(top_n, host_vector_result.size()));
  for (size_t i = 0; i < host_vector_result.size(); ++i) {
    const auto entry_idx = host_vector_result[i];
    if (!is_empty_entry<K>(entry_idx, groupby_buffer, layout.row_bytes)) {
      result.push_back(entry_idx);
      if (result.size() >= top_n) {
        break;
      }
    }
  }
  return result;
}
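// Splices the indices of NULL entries into the sorted index buffer, before or
// after the non-NULL indices depending on the NULLS FIRST / NULLS LAST flag.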
void add_nulls(std::vector<uint32_t>& idx_buff,
               const std::vector<uint32_t>& null_idx_buff,
               const PodOrderEntry& oe) {
  if (null_idx_buff.empty()) {
    return;
  }
  const auto insertion_point = oe.nulls_first ? idx_buff.begin() : idx_buff.end();
  idx_buff.insert(insertion_point, null_idx_buff.begin(), null_idx_buff.end());
}
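// Copies a host vector into a scoped device buffer and returns a device
// pointer to the copy; empty inputs yield a null device pointer.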
template <typename T>
thrust::device_ptr<T> get_device_copy_ptr(const thrust::host_vector<T>& host_vec,
                                          ThrustAllocator& thrust_allocator) {
  if (host_vec.empty()) {
    return thrust::device_ptr<T>(static_cast<T*>(nullptr));
  }
  const auto host_vec_bytes = host_vec.size() * sizeof(T);
  T* dev_ptr = reinterpret_cast<T*>(
      thrust_allocator.allocateScopedBuffer(align_to_int64(host_vec_bytes)));
  copy_to_gpu(thrust_allocator.getDataMgr(),
              reinterpret_cast<CUdeviceptr>(dev_ptr),
              &host_vec[0],
              host_vec_bytes,
              thrust_allocator.getDeviceId());
  return thrust::device_ptr<T>(dev_ptr);
}
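// Top-n sort for floating point (and AVG) order-by columns. IEEE-754 bit
// patterns do not order correctly as integers across the sign boundary, so the
// entries are partitioned into negative and positive values (NULLs set aside),
// each partition is sorted separately, and the two results are stitched back
// together with the NULLs added at the requested end.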
template <class K>
std::vector<uint32_t> baseline_sort_fp(const ExecutorDeviceType device_type,
                                       const int device_id,
                                       Data_Namespace::DataMgr* data_mgr,
                                       const int8_t* groupby_buffer,
                                       const thrust::host_vector<int64_t>& oe_col_buffer,
                                       const PodOrderEntry& oe,
                                       const GroupByBufferLayoutInfo& layout,
                                       const size_t top_n,
                                       const size_t start,
                                       const size_t step) {
  thrust::host_vector<uint32_t> neg_idx_buff;
  thrust::host_vector<uint32_t> pos_idx_buff;
  std::vector<uint32_t> null_idx_buff;
  thrust::host_vector<int64_t> neg_oe_col_buffer;
  thrust::host_vector<int64_t> pos_oe_col_buffer;
  const auto slice_entry_count =
      layout.entry_count / step + (layout.entry_count % step ? 1 : 0);
  neg_idx_buff.reserve(slice_entry_count);
  pos_idx_buff.reserve(slice_entry_count);
  null_idx_buff.reserve(slice_entry_count);
  neg_oe_col_buffer.reserve(slice_entry_count);
  pos_oe_col_buffer.reserve(slice_entry_count);
  size_t oe_col_buffer_idx = 0;
  const auto entry_ti = get_compact_type(layout.oe_target_info);
  const bool float_argument_input = takes_float_argument(layout.oe_target_info);
  // The sign bit sits in the same position for integers and for the 32-bit
  // float bit patterns stored when the argument is a float.
  const auto is_negative =
      float_argument_input ? [](const int64_t v) -> bool { return (v & (1 << 31)) != 0; }
                           : [](const int64_t v) -> bool { return v < 0; };
  for (size_t i = start; i < layout.entry_count; i += step, ++oe_col_buffer_idx) {
    if (!is_empty_entry<K>(i, groupby_buffer, layout.row_bytes) &&
        oe_col_buffer[oe_col_buffer_idx] ==
            null_val_bit_pattern(entry_ti, float_argument_input)) {
      null_idx_buff.push_back(i);
      continue;
    }
    if (is_negative(oe_col_buffer[oe_col_buffer_idx])) {
      neg_idx_buff.push_back(i);
      neg_oe_col_buffer.push_back(oe_col_buffer[oe_col_buffer_idx]);
    } else {
      pos_idx_buff.push_back(i);
      pos_oe_col_buffer.push_back(oe_col_buffer[oe_col_buffer_idx]);
    }
  }
  std::vector<uint32_t> pos_result;
  ThrustAllocator thrust_allocator(data_mgr, device_id);
  if (device_type == ExecutorDeviceType::GPU) {
    const auto dev_pos_idx_buff = get_device_copy_ptr(pos_idx_buff, thrust_allocator);
    const auto dev_pos_oe_col_buffer =
        get_device_copy_ptr(pos_oe_col_buffer, thrust_allocator);
    pos_result = do_radix_sort<K>(device_type,
                                  thrust_allocator,
                                  groupby_buffer,
                                  dev_pos_oe_col_buffer,
                                  dev_pos_oe_col_buffer + pos_oe_col_buffer.size(),
                                  dev_pos_idx_buff,
                                  pos_idx_buff.size(),
                                  oe,
                                  layout,
                                  top_n);
  } else {
    CHECK(device_type == ExecutorDeviceType::CPU);
    pos_result = do_radix_sort<K>(device_type,
                                  thrust_allocator,
                                  groupby_buffer,
                                  pos_oe_col_buffer.begin(),
                                  pos_oe_col_buffer.end(),
                                  pos_idx_buff.begin(),
                                  pos_idx_buff.size(),
                                  oe,
                                  layout,
                                  top_n);
  }
  std::vector<uint32_t> neg_result;
  // Negative values compare in reverse order of their raw bit patterns, so the
  // negative partition is sorted with the direction flipped.
  PodOrderEntry reverse_oe{oe.tle_no, !oe.is_desc, oe.nulls_first};
  if (device_type == ExecutorDeviceType::GPU) {
    const auto dev_neg_idx_buff = get_device_copy_ptr(neg_idx_buff, thrust_allocator);
    const auto dev_neg_oe_col_buffer =
        get_device_copy_ptr(neg_oe_col_buffer, thrust_allocator);
    neg_result = do_radix_sort<K>(device_type,
                                  thrust_allocator,
                                  groupby_buffer,
                                  dev_neg_oe_col_buffer,
                                  dev_neg_oe_col_buffer + neg_oe_col_buffer.size(),
                                  dev_neg_idx_buff,
                                  neg_idx_buff.size(),
                                  reverse_oe,
                                  layout,
                                  top_n);
  } else {
    CHECK(device_type == ExecutorDeviceType::CPU);
    neg_result = do_radix_sort<K>(device_type,
                                  thrust_allocator,
                                  groupby_buffer,
                                  neg_oe_col_buffer.begin(),
                                  neg_oe_col_buffer.end(),
                                  neg_idx_buff.begin(),
                                  neg_idx_buff.size(),
                                  reverse_oe,
                                  layout,
                                  top_n);
  }
  if (oe.is_desc) {
    pos_result.insert(pos_result.end(), neg_result.begin(), neg_result.end());
    add_nulls(pos_result, null_idx_buff, oe);
    return pos_result;
  }
  neg_result.insert(neg_result.end(), pos_result.begin(), pos_result.end());
  add_nulls(neg_result, null_idx_buff, oe);
  return neg_result;
}
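// Top-n sort for integer order-by columns: NULL entries are separated out
// first, the non-NULL entries are sorted by do_radix_sort, and the NULL
// indices are re-attached on the side requested by the order entry.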
template <class K>
std::vector<uint32_t> baseline_sort_int(const ExecutorDeviceType device_type,
                                        const int device_id,
                                        Data_Namespace::DataMgr* data_mgr,
                                        const int8_t* groupby_buffer,
                                        const thrust::host_vector<int64_t>& oe_col_buffer,
                                        const PodOrderEntry& oe,
                                        const GroupByBufferLayoutInfo& layout,
                                        const size_t top_n,
                                        const size_t start,
                                        const size_t step) {
  const auto entry_ti = get_compact_type(layout.oe_target_info);
  std::vector<uint32_t> null_idx_buff;
  thrust::host_vector<uint32_t> notnull_idx_buff;
  const auto slice_entry_count =
      layout.entry_count / step + (layout.entry_count % step ? 1 : 0);
  null_idx_buff.reserve(slice_entry_count);
  notnull_idx_buff.reserve(slice_entry_count);
  thrust::host_vector<int64_t> notnull_oe_col_buffer;
  notnull_oe_col_buffer.reserve(slice_entry_count);
  size_t oe_col_buffer_idx = 0;
  for (size_t i = start; i < layout.entry_count; i += step, ++oe_col_buffer_idx) {
    if (!is_empty_entry<K>(i, groupby_buffer, layout.row_bytes) &&
        oe_col_buffer[oe_col_buffer_idx] == null_val_bit_pattern(entry_ti, false)) {
      null_idx_buff.push_back(i);
    } else {
      notnull_idx_buff.push_back(i);
      notnull_oe_col_buffer.push_back(oe_col_buffer[oe_col_buffer_idx]);
    }
  }
  std::vector<uint32_t> notnull_result;
  ThrustAllocator thrust_allocator(data_mgr, device_id);
  if (device_type == ExecutorDeviceType::GPU) {
    const auto dev_notnull_idx_buff =
        get_device_copy_ptr(notnull_idx_buff, thrust_allocator);
    const auto dev_notnull_oe_col_buffer =
        get_device_copy_ptr(notnull_oe_col_buffer, thrust_allocator);
    notnull_result =
        do_radix_sort<K>(device_type,
                         thrust_allocator,
                         groupby_buffer,
                         dev_notnull_oe_col_buffer,
                         dev_notnull_oe_col_buffer + notnull_oe_col_buffer.size(),
                         dev_notnull_idx_buff,
                         notnull_idx_buff.size(),
                         oe,
                         layout,
                         top_n);
  } else {
    CHECK(device_type == ExecutorDeviceType::CPU);
    notnull_result = do_radix_sort<K>(device_type,
                                      thrust_allocator,
                                      groupby_buffer,
                                      notnull_oe_col_buffer.begin(),
                                      notnull_oe_col_buffer.end(),
                                      notnull_idx_buff.begin(),
                                      notnull_idx_buff.size(),
                                      oe,
                                      layout,
                                      top_n);
  }
  add_nulls(notnull_result, null_idx_buff, oe);
  return notnull_result;
}
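// Reads the order-by column out of the row-wise group-by buffer into a
// contiguous host vector, visiting rows start, start + step, ... For AVG
// targets the (sum, count) pair is folded into a double, whose bit pattern is
// stored so downstream code can keep treating the column as int64.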
template <class K>
thrust::host_vector<int64_t> collect_order_entry_column(
    const int8_t* groupby_buffer,
    const GroupByBufferLayoutInfo& layout,
    const size_t start,
    const size_t step) {
  thrust::host_vector<int64_t> oe_col_buffer;
  const auto row_ptr = groupby_buffer + start * layout.row_bytes;
  // A non-negative target_groupby_index means the order entry refers to a
  // group key column; otherwise col_off locates the aggregate column.
  auto crt_group_ptr1 = layout.target_groupby_index >= 0
                            ? row_ptr + layout.target_groupby_index * sizeof(K)
                            : row_ptr + layout.col_off;
  const int8_t* crt_group_ptr2{nullptr};
  if (layout.oe_target_info.agg_kind == kAVG) {
    crt_group_ptr2 = crt_group_ptr1 + layout.col_bytes;
  }
  const auto entry_ti = get_compact_type(layout.oe_target_info);
  const bool float_argument_input = takes_float_argument(layout.oe_target_info);
  const auto step_bytes = layout.row_bytes * step;
  const auto col_bytes = float_argument_input ? entry_ti.get_size() : layout.col_bytes;
  for (size_t i = start; i < layout.entry_count; i += step) {
    auto val1 = read_int_from_buff(crt_group_ptr1, col_bytes > 0 ? col_bytes : sizeof(K));
    if (crt_group_ptr2) {
      const auto val2 = read_int_from_buff(crt_group_ptr2, 8);
      const auto avg_val = pair_to_double({val1, val2}, entry_ti, float_argument_input);
      val1 = *reinterpret_cast<const int64_t*>(&avg_val);
    }
    oe_col_buffer.push_back(val1);
    crt_group_ptr1 += step_bytes;
    if (crt_group_ptr2) {
      crt_group_ptr2 += step_bytes;
    }
  }
  return oe_col_buffer;
}
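// Entry point: gathers the order-by column for this thread's slice and
// dispatches to the floating point path, the integer path with NULL
// separation, or the fast integer path. K is the width of the group key
// entries in the buffer.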
template <class K>
std::vector<uint32_t> baseline_sort(const ExecutorDeviceType device_type,
                                    const int device_id,
                                    Data_Namespace::DataMgr* data_mgr,
                                    const int8_t* groupby_buffer,
                                    const PodOrderEntry& oe,
                                    const GroupByBufferLayoutInfo& layout,
                                    const size_t top_n,
                                    const size_t start,
                                    const size_t step) {
  auto oe_col_buffer = collect_order_entry_column<K>(groupby_buffer, layout, start, step);
  const auto entry_ti = get_compact_type(layout.oe_target_info);
  CHECK(entry_ti.is_number());
  if (entry_ti.is_fp() || layout.oe_target_info.agg_kind == kAVG) {
    return baseline_sort_fp<K>(device_type, device_id, data_mgr, groupby_buffer,
                               oe_col_buffer, oe, layout, top_n, start, step);
  }
  // The integer NULL sentinel would sort to the wrong end in these two cases,
  // so take the path which separates NULLs into their own buffer.
  if ((oe.is_desc && oe.nulls_first) || (!oe.is_desc && !oe.nulls_first)) {
    return baseline_sort_int<K>(device_type, device_id, data_mgr, groupby_buffer,
                                oe_col_buffer, oe, layout, top_n, start, step);
  }
  // Fast path: the NULL sentinel lands at the right end as a side effect of
  // how it is represented.
  ThrustAllocator thrust_allocator(data_mgr, device_id);
  if (device_type == ExecutorDeviceType::GPU) {
    if (oe_col_buffer.empty()) {
      return {};
    }
    const auto dev_idx_buff =
        get_device_ptr<uint32_t>(oe_col_buffer.size(), thrust_allocator);
    thrust::sequence(dev_idx_buff, dev_idx_buff + oe_col_buffer.size(), start, step);
    const auto dev_oe_col_buffer = get_device_copy_ptr(oe_col_buffer, thrust_allocator);
    return do_radix_sort<K>(device_type, thrust_allocator, groupby_buffer,
                            dev_oe_col_buffer, dev_oe_col_buffer + oe_col_buffer.size(),
                            dev_idx_buff, oe_col_buffer.size(), oe, layout, top_n);
  }
  CHECK(device_type == ExecutorDeviceType::CPU);
  thrust::host_vector<uint32_t> host_idx_buff(oe_col_buffer.size());
  thrust::sequence(host_idx_buff.begin(), host_idx_buff.end(), start, step);
  return do_radix_sort<K>(device_type, thrust_allocator, groupby_buffer,
                          oe_col_buffer.begin(), oe_col_buffer.end(),
                          host_idx_buff.begin(), host_idx_buff.size(), oe, layout, top_n);
}
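// Explicit instantiations for the two supported group key widths.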
template std::vector<uint32_t> baseline_sort<int32_t>(
    const ExecutorDeviceType device_type, const int device_id,
    Data_Namespace::DataMgr* data_mgr, const int8_t* groupby_buffer,
    const PodOrderEntry& oe, const GroupByBufferLayoutInfo& layout,
    const size_t top_n, const size_t start, const size_t step);

template std::vector<uint32_t> baseline_sort<int64_t>(
    const ExecutorDeviceType device_type, const int device_id,
    Data_Namespace::DataMgr* data_mgr, const int8_t* groupby_buffer,
    const PodOrderEntry& oe, const GroupByBufferLayoutInfo& layout,
    const size_t top_n, const size_t start, const size_t step);