2 #include <thrust/copy.h>
3 #include <thrust/device_vector.h>
4 #include <thrust/execution_policy.h>
5 #include <thrust/gather.h>
6 #include <thrust/sort.h>
// Fragment of a templated key/index sort that runs through Thrust on device
// pointers. NOTE(review): presumably the body of the typed sort_on_gpu<T>
// helper; its signature line, the branch that chooses between the two sorts
// below, and the closing brace are not part of this excerpt - confirm against
// the full file.
17 const uint64_t entry_count,
// Wrap the raw key and index buffers in thrust::device_ptr so the Thrust
// algorithms below can take them as iterators.
20 thrust::device_ptr<T> key_ptr(val_buff);
21 thrust::device_ptr<int32_t> idx_ptr(idx_buff);
// Fill the index buffer with 0, 1, 2, ... so that after the sort it records
// where each key originally lived.
// NOTE(review): indices are int32_t while entry_count is uint64_t; confirm
// callers never pass more entries than int32_t can index.
22 thrust::sequence(idx_ptr, idx_ptr + entry_count);
// Sort using thrust::greater<T> as the comparator (largest key first). The
// execution policy is built from `alloc`.
// NOTE(review): presumably taken when the `desc` flag is set - the condition
// itself is on a line not shown here.
24 thrust::sort_by_key(thrust::device(alloc),
26 key_ptr + entry_count,
28 thrust::greater<T>());
// Same sort with the default comparator; idx_ptr is permuted alongside the keys.
30 thrust::sort_by_key(thrust::device(alloc), key_ptr, key_ptr + entry_count, idx_ptr);
// Fragment of a templated routine that reorders val_buff according to the
// indices in idx_buff, using a scratch buffer obtained from `alloc`.
// NOTE(review): presumably the typed apply_permutation_on_gpu<T> helper; its
// signature line and closing brace are not part of this excerpt.
37 const uint64_t entry_count,
// Wrap the raw buffers in thrust::device_ptr for use as Thrust iterators.
39 thrust::device_ptr<T> key_ptr(val_buff);
40 thrust::device_ptr<int32_t> idx_ptr(idx_buff);
// Size in bytes of a scratch copy holding entry_count elements of T.
41 const size_t buf_size = entry_count *
sizeof(
T);
// Obtain the scratch buffer from the allocator and view it as T elements.
// NOTE(review): the returned pointer is used without any check in the lines
// visible here - confirm how the allocator reports failure.
42 T* raw_ptr =
reinterpret_cast<T*
>(alloc.
allocate(buf_size));
43 thrust::device_ptr<T> tmp_ptr(raw_ptr);
// Snapshot the current values into the scratch buffer, then gather them back:
// key_ptr[i] = tmp_ptr[idx_ptr[i]]. The snapshot is needed because gather
// cannot read from and write to the same range.
44 thrust::copy(thrust::device(alloc), key_ptr, key_ptr + entry_count, tmp_ptr);
45 thrust::gather(thrust::device(alloc), idx_ptr, idx_ptr + entry_count, tmp_ptr, key_ptr);
// Hand the scratch buffer back to the allocator with the same byte count.
// NOTE(review): this call is skipped if copy/gather above throws; no guard is
// visible in this excerpt.
46 alloc.
deallocate(reinterpret_cast<int8_t*>(raw_ptr), buf_size);
// Fragment of a templated key/index sort that calls Thrust directly on the raw
// val_buff / idx_buff pointers, with no execution policy argument.
// NOTE(review): presumably the typed sort_on_cpu<T> helper; its signature, the
// branch choosing between the two sorts, and the closing brace are not part of
// this excerpt.
52 const uint64_t entry_count,
// Fill idx_buff with 0, 1, 2, ... so the sort leaves behind the permutation
// that was applied to the keys.
54 thrust::sequence(idx_buff, idx_buff + entry_count);
// Sort with thrust::greater<T> (largest key first), moving indices with keys.
56 thrust::sort_by_key(val_buff, val_buff + entry_count, idx_buff, thrust::greater<T>());
// Sort with the default comparator, moving indices with keys.
58 thrust::sort_by_key(val_buff, val_buff + entry_count, idx_buff);
// Fragment of a templated routine that reorders val_buff according to idx_buff
// using the caller-supplied tmp_buff as scratch space.
// NOTE(review): presumably the typed apply_permutation_on_cpu<T> helper; its
// signature line and closing brace are not part of this excerpt.
65 const uint64_t entry_count,
// Snapshot the values into tmp_buff first; tmp_buff must therefore have room
// for entry_count elements.
67 thrust::copy(val_buff, val_buff + entry_count, tmp_buff);
// Gather back in permuted order: val_buff[i] = tmp_buff[idx_buff[i]].
68 thrust::gather(idx_buff, idx_buff + entry_count, tmp_buff, val_buff);
// Fragment of a dispatcher that picks the element width for the GPU sort from
// `chosen_bytes`. The lines that open each call are not part of this excerpt;
// only the argument lists are visible.
74 const uint64_t entry_count,
76 const uint32_t chosen_bytes,
// NOTE(review): the case labels are on lines not shown here; presumably they
// are the byte widths matching the casts below - confirm against the full file.
79 switch (chosen_bytes) {
// View the value buffer as int8_t elements.
82 reinterpret_cast<int8_t*>(val_buff), idx_buff, entry_count, desc, alloc);
// View the value buffer as int16_t elements.
86 reinterpret_cast<int16_t*>(val_buff), idx_buff, entry_count, desc, alloc);
// View the value buffer as int32_t elements.
90 reinterpret_cast<int32_t*>(val_buff), idx_buff, entry_count, desc, alloc);
// Pass val_buff through with its declared element type, no cast.
93 sort_on_gpu(val_buff, idx_buff, entry_count, desc, alloc);
// Fragment of a dispatcher that picks the element width for the host sort from
// `chosen_bytes` and forwards to the typed sort_on_cpu overload.
104 const uint64_t entry_count,
106 const uint32_t chosen_bytes) {
// NOTE(review): the case labels, break statements and any default branch are
// on lines not shown here - confirm against the full file.
108 switch (chosen_bytes) {
// View the value buffer as int8_t elements.
110 sort_on_cpu(reinterpret_cast<int8_t*>(val_buff), idx_buff, entry_count, desc);
// View the value buffer as int16_t elements.
113 sort_on_cpu(reinterpret_cast<int16_t*>(val_buff), idx_buff, entry_count, desc);
// View the value buffer as int32_t elements.
116 sort_on_cpu(reinterpret_cast<int32_t*>(val_buff), idx_buff, entry_count, desc);
// Pass val_buff through with its declared element type, no cast.
119 sort_on_cpu(val_buff, idx_buff, entry_count, desc);
// Fragment of a dispatcher that picks the element width for the GPU
// permutation step from `chosen_bytes`. The lines that open each call are not
// part of this excerpt; only the argument lists are visible.
130 const uint64_t entry_count,
131 const uint32_t chosen_bytes,
// NOTE(review): the case labels and the uncast (full-width) branch are on
// lines not shown here - confirm against the full file.
134 switch (chosen_bytes) {
// View the value buffer as int8_t elements.
137 reinterpret_cast<int8_t*>(val_buff), idx_buff, entry_count, alloc);
// View the value buffer as int16_t elements.
141 reinterpret_cast<int16_t*>(val_buff), idx_buff, entry_count, alloc);
// View the value buffer as int32_t elements.
145 reinterpret_cast<int32_t*>(val_buff), idx_buff, entry_count, alloc);
// Fragment of a dispatcher that picks the element width for the host
// permutation step from `chosen_bytes`. Only the final argument line of each
// call is visible in this excerpt.
159 const uint64_t entry_count,
161 const uint32_t chosen_bytes) {
// NOTE(review): the case labels, the leading call arguments and the uncast
// (full-width) branch are on lines not shown here - confirm against the full
// file.
163 switch (chosen_bytes) {
// Scratch buffer viewed as int8_t elements.
168 reinterpret_cast<int8_t*>(tmp_buff));
// Scratch buffer viewed as int16_t elements.
174 reinterpret_cast<int16_t*>(tmp_buff));
// Scratch buffer viewed as int32_t elements.
180 reinterpret_cast<int32_t*>(tmp_buff));
int8_t * allocate(std::ptrdiff_t num_bytes)
void sort_on_cpu(int64_t *val_buff, int32_t *idx_buff, const uint64_t entry_count, const bool desc, const uint32_t chosen_bytes)
void deallocate(int8_t *ptr, size_t num_bytes)
DEVICE auto copy(ARGS &&...args)
void apply_permutation_on_gpu(int64_t *val_buff, int32_t *idx_buff, const uint64_t entry_count, const uint32_t chosen_bytes, ThrustAllocator &alloc)
void apply_permutation_on_cpu(int64_t *val_buff, int32_t *idx_buff, const uint64_t entry_count, int64_t *tmp_buff, const uint32_t chosen_bytes)
void sort_on_gpu(int64_t *val_buff, int32_t *idx_buff, const uint64_t entry_count, const bool desc, const uint32_t chosen_bytes, ThrustAllocator &alloc)