OmniSciDB  04ee39c94c
InPlaceSortImpl.cu
#ifdef HAVE_CUDA
#include <thrust/copy.h>
#include <thrust/device_vector.h>
#include <thrust/execution_policy.h>
#include <thrust/functional.h>  // thrust::greater
#include <thrust/gather.h>
#include <thrust/sequence.h>  // thrust::sequence
#include <thrust/sort.h>
#endif

#include "Allocators/ThrustAllocator.h"
#include "InPlaceSortImpl.h"

#ifdef HAVE_CUDA

// Sorts the key buffer in place on the GPU and fills idx_buff with the
// permutation that was applied, so payload columns can later be reordered
// with apply_permutation_on_gpu.
template <typename T>
void sort_on_gpu(T* val_buff,
                 int32_t* idx_buff,
                 const uint64_t entry_count,
                 const bool desc,
                 ThrustAllocator& alloc) {
  thrust::device_ptr<T> key_ptr(val_buff);
  thrust::device_ptr<int32_t> idx_ptr(idx_buff);
  thrust::sequence(idx_ptr, idx_ptr + entry_count);
  if (desc) {
    thrust::sort_by_key(thrust::device(alloc),
                        key_ptr,
                        key_ptr + entry_count,
                        idx_ptr,
                        thrust::greater<T>());
  } else {
    thrust::sort_by_key(thrust::device(alloc), key_ptr, key_ptr + entry_count, idx_ptr);
  }
}

// Reorders val_buff on the GPU according to the permutation in idx_buff,
// using a scratch buffer obtained from the allocator.
template <typename T>
void apply_permutation_on_gpu(T* val_buff,
                              int32_t* idx_buff,
                              const uint64_t entry_count,
                              ThrustAllocator& alloc) {
  thrust::device_ptr<T> key_ptr(val_buff);
  thrust::device_ptr<int32_t> idx_ptr(idx_buff);
  const size_t buf_size = entry_count * sizeof(T);
  T* raw_ptr = reinterpret_cast<T*>(alloc.allocate(buf_size));
  thrust::device_ptr<T> tmp_ptr(raw_ptr);
  thrust::copy(thrust::device(alloc), key_ptr, key_ptr + entry_count, tmp_ptr);
  thrust::gather(thrust::device(alloc), idx_ptr, idx_ptr + entry_count, tmp_ptr, key_ptr);
  alloc.deallocate(reinterpret_cast<int8_t*>(raw_ptr), buf_size);
}

// CPU counterpart of sort_on_gpu(): fills idx_buff with the permutation that
// sorts the host-resident key buffer.
template <typename T>
void sort_on_cpu(T* val_buff,
                 int32_t* idx_buff,
                 const uint64_t entry_count,
                 const bool desc) {
  thrust::sequence(idx_buff, idx_buff + entry_count);
  if (desc) {
    thrust::sort_by_key(val_buff, val_buff + entry_count, idx_buff, thrust::greater<T>());
  } else {
    thrust::sort_by_key(val_buff, val_buff + entry_count, idx_buff);
  }
}

// CPU counterpart of apply_permutation_on_gpu(); the caller supplies the
// scratch buffer instead of allocating one here.
template <typename T>
void apply_permutation_on_cpu(T* val_buff,
                              int32_t* idx_buff,
                              const uint64_t entry_count,
                              T* tmp_buff) {
  thrust::copy(val_buff, val_buff + entry_count, tmp_buff);
  thrust::gather(idx_buff, idx_buff + entry_count, tmp_buff, val_buff);
}
#endif

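// The exported entry points below dispatch on the compacted column width
// (chosen_bytes) and forward the buffers, reinterpreted to the matching
// integer type, to the templated helpers above.
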
void sort_on_gpu(int64_t* val_buff,
                 int32_t* idx_buff,
                 const uint64_t entry_count,
                 const bool desc,
                 const uint32_t chosen_bytes,
                 ThrustAllocator& alloc) {
#ifdef HAVE_CUDA
  switch (chosen_bytes) {
    case 1:
      sort_on_gpu(
          reinterpret_cast<int8_t*>(val_buff), idx_buff, entry_count, desc, alloc);
      break;
    case 2:
      sort_on_gpu(
          reinterpret_cast<int16_t*>(val_buff), idx_buff, entry_count, desc, alloc);
      break;
    case 4:
      sort_on_gpu(
          reinterpret_cast<int32_t*>(val_buff), idx_buff, entry_count, desc, alloc);
      break;
    case 8:
      sort_on_gpu(val_buff, idx_buff, entry_count, desc, alloc);
      break;
    default:
      // FIXME(miyu): CUDA linker doesn't accept assertions on GPU yet.
      break;
  }
#endif
}

void sort_on_cpu(int64_t* val_buff,
                 int32_t* idx_buff,
                 const uint64_t entry_count,
                 const bool desc,
                 const uint32_t chosen_bytes) {
#ifdef HAVE_CUDA
  switch (chosen_bytes) {
    case 1:
      sort_on_cpu(reinterpret_cast<int8_t*>(val_buff), idx_buff, entry_count, desc);
      break;
    case 2:
      sort_on_cpu(reinterpret_cast<int16_t*>(val_buff), idx_buff, entry_count, desc);
      break;
    case 4:
      sort_on_cpu(reinterpret_cast<int32_t*>(val_buff), idx_buff, entry_count, desc);
      break;
    case 8:
      sort_on_cpu(val_buff, idx_buff, entry_count, desc);
      break;
    default:
      // FIXME(miyu): CUDA linker doesn't accept assertions on GPU yet.
      break;
  }
#endif
}

void apply_permutation_on_gpu(int64_t* val_buff,
                              int32_t* idx_buff,
                              const uint64_t entry_count,
                              const uint32_t chosen_bytes,
                              ThrustAllocator& alloc) {
#ifdef HAVE_CUDA
  switch (chosen_bytes) {
    case 1:
      apply_permutation_on_gpu(
          reinterpret_cast<int8_t*>(val_buff), idx_buff, entry_count, alloc);
      break;
    case 2:
      apply_permutation_on_gpu(
          reinterpret_cast<int16_t*>(val_buff), idx_buff, entry_count, alloc);
      break;
    case 4:
      apply_permutation_on_gpu(
          reinterpret_cast<int32_t*>(val_buff), idx_buff, entry_count, alloc);
      break;
    case 8:
      apply_permutation_on_gpu(val_buff, idx_buff, entry_count, alloc);
      break;
    default:
      // FIXME(miyu): CUDA linker doesn't accept assertions on GPU yet.
      break;
  }
#endif
}

void apply_permutation_on_cpu(int64_t* val_buff,
                              int32_t* idx_buff,
                              const uint64_t entry_count,
                              int64_t* tmp_buff,
                              const uint32_t chosen_bytes) {
#ifdef HAVE_CUDA
  switch (chosen_bytes) {
    case 1:
      apply_permutation_on_cpu(reinterpret_cast<int8_t*>(val_buff),
                               idx_buff,
                               entry_count,
                               reinterpret_cast<int8_t*>(tmp_buff));
      break;
    case 2:
      apply_permutation_on_cpu(reinterpret_cast<int16_t*>(val_buff),
                               idx_buff,
                               entry_count,
                               reinterpret_cast<int16_t*>(tmp_buff));
      break;
    case 4:
      apply_permutation_on_cpu(reinterpret_cast<int32_t*>(val_buff),
                               idx_buff,
                               entry_count,
                               reinterpret_cast<int32_t*>(tmp_buff));
      break;
    case 8:
      apply_permutation_on_cpu(val_buff, idx_buff, entry_count, tmp_buff);
      break;
    default:
      // FIXME(miyu): CUDA linker doesn't accept assertions on GPU yet.
      break;
  }
#endif
}
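
Usage sketch: a hypothetical caller of the CPU entry points defined above might sort a 64-bit key column descending and then reorder a payload column through the resulting permutation. The wrapper function and buffer names below are illustrative only, the declarations are assumed to come from InPlaceSortImpl.h, and the bodies above only do work in a HAVE_CUDA build.

#include <cstdint>
#include <vector>

#include "InPlaceSortImpl.h"

// Illustrative caller: sort keys in place and apply the same permutation to a
// payload column. sort_on_cpu fills `permutation` via thrust::sequence, so it
// does not need to be pre-initialized.
void sort_two_columns(std::vector<int64_t>& keys, std::vector<int64_t>& payload) {
  const uint64_t entry_count = keys.size();
  std::vector<int32_t> permutation(entry_count);  // written by sort_on_cpu
  std::vector<int64_t> scratch(entry_count);      // tmp_buff for the gather step

  // Sort the 8-byte keys descending; `permutation` records where each row came from.
  sort_on_cpu(keys.data(), permutation.data(), entry_count, /*desc=*/true,
              /*chosen_bytes=*/8);

  // Reorder the payload column with the permutation produced above.
  apply_permutation_on_cpu(payload.data(), permutation.data(), entry_count,
                           scratch.data(), /*chosen_bytes=*/8);
}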