OmniSciDB  b24e664e58
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
anonymous_namespace{ResultSetSortImpl.cu} Namespace Reference

Functions

template<class K , class V , class I >
std::vector< uint32_t > do_radix_sort (const ExecutorDeviceType device_type, ThrustAllocator &thrust_allocator, const int8_t *groupby_buffer, V dev_oe_col_buffer_begin, V dev_oe_col_buffer_end, I dev_idx_buff_begin, const size_t dev_idx_buff_size, const PodOrderEntry &oe, const GroupByBufferLayoutInfo &layout, const size_t top_n)
 
void add_nulls (std::vector< uint32_t > &idx_buff, const std::vector< uint32_t > &null_idx_buff, const PodOrderEntry &oe)
 
template<typename T >
thrust::device_ptr< T > get_device_copy_ptr (const thrust::host_vector< T > &host_vec, ThrustAllocator &thrust_allocator)
 
template<class K >
std::vector< uint32_t > baseline_sort_fp (const ExecutorDeviceType device_type, const int device_id, Data_Namespace::DataMgr *data_mgr, const int8_t *groupby_buffer, const thrust::host_vector< int64_t > &oe_col_buffer, const PodOrderEntry &oe, const GroupByBufferLayoutInfo &layout, const size_t top_n, const size_t start, const size_t step)
 
template<class K >
std::vector< uint32_t > baseline_sort_int (const ExecutorDeviceType device_type, const int device_id, Data_Namespace::DataMgr *data_mgr, const int8_t *groupby_buffer, const thrust::host_vector< int64_t > &oe_col_buffer, const PodOrderEntry &oe, const GroupByBufferLayoutInfo &layout, const size_t top_n, const size_t start, const size_t step)
 
template<class K >
thrust::host_vector< int64_t > collect_order_entry_column (const int8_t *groupby_buffer, const GroupByBufferLayoutInfo &layout, const size_t start, const size_t step)
 

Function Documentation

void anonymous_namespace{ResultSetSortImpl.cu}::add_nulls ( std::vector< uint32_t > &  idx_buff,
const std::vector< uint32_t > &  null_idx_buff,
const PodOrderEntry oe 
)

Definition at line 87 of file ResultSetSortImpl.cu.

References PodOrderEntry::nulls_first.

Referenced by baseline_sort_fp(), and baseline_sort_int().

// add_nulls: splice the indices of NULL-valued entries into an already-sorted
// index buffer — prepended for NULLS FIRST, appended otherwise.
89  {
90  if (null_idx_buff.empty()) {
91  return;
92  }
// NULLS FIRST places the NULL block ahead of all sorted non-null rows;
// otherwise it trails them.
93  const auto insertion_point = oe.nulls_first ? idx_buff.begin() : idx_buff.end();
94  idx_buff.insert(insertion_point, null_idx_buff.begin(), null_idx_buff.end());
95 }
bool nulls_first

+ Here is the caller graph for this function:

template<class K >
std::vector<uint32_t> anonymous_namespace{ResultSetSortImpl.cu}::baseline_sort_fp ( const ExecutorDeviceType  device_type,
const int  device_id,
Data_Namespace::DataMgr data_mgr,
const int8_t *  groupby_buffer,
const thrust::host_vector< int64_t > &  oe_col_buffer,
const PodOrderEntry oe,
const GroupByBufferLayoutInfo layout,
const size_t  top_n,
const size_t  start,
const size_t  step 
)

Definition at line 115 of file ResultSetSortImpl.cu.

References add_nulls(), TargetInfo::agg_kind, CHECK(), CPU, GroupByBufferLayoutInfo::entry_count, get_device_copy_ptr(), GPU, PodOrderEntry::is_desc, kAVG, kDOUBLE, null_val_bit_pattern(), PodOrderEntry::nulls_first, GroupByBufferLayoutInfo::oe_target_info, GroupByBufferLayoutInfo::row_bytes, takes_float_argument(), and PodOrderEntry::tle_no.

// baseline_sort_fp: top-n sort on a floating-point order-by column over a
// row-wise (baseline) group-by buffer. Entries are partitioned into
// negative / positive / NULL buckets so that an integer sort of the raw
// 64-bit patterns orders each bucket correctly (IEEE-754 bit patterns sort
// like integers within one sign); the buckets are then concatenated in the
// direction requested by `oe` and NULLs are spliced in via add_nulls().
124  {
125  thrust::host_vector<uint32_t> neg_idx_buff;
126  thrust::host_vector<uint32_t> pos_idx_buff;
127  std::vector<uint32_t> null_idx_buff;
128  thrust::host_vector<int64_t> neg_oe_col_buffer;
129  thrust::host_vector<int64_t> pos_oe_col_buffer;
// Upper bound on entries in this (start, step) slice: ceil(entry_count / step).
130  const auto slice_entry_count =
131  layout.entry_count / step + (layout.entry_count % step ? 1 : 0);
132  neg_idx_buff.reserve(slice_entry_count);
133  pos_idx_buff.reserve(slice_entry_count);
134  null_idx_buff.reserve(slice_entry_count);
135  neg_oe_col_buffer.reserve(slice_entry_count);
136  pos_oe_col_buffer.reserve(slice_entry_count);
137  size_t oe_col_buffer_idx = 0;
138  const auto& oe_info = layout.oe_target_info;
// AVG has already been collapsed to a double by collect_order_entry_column,
// so treat its type as DOUBLE for NULL-pattern matching.
139  const auto col_ti =
140  oe_info.agg_kind == kAVG ? SQLTypeInfo(kDOUBLE, false) : oe_info.sql_type;
141  // Exclude AVG b/c collect_order_entry_column already makes its pair collapse into a
142  // double
143  const bool float_argument_input =
144  takes_float_argument(oe_info) && oe_info.agg_kind != kAVG;
145 
// For a 32-bit float payload, the sign bit is bit 31 of the stored word.
// NOTE(review): (1 << 31) is a signed 32-bit int (INT_MIN) that sign-extends
// to 0xFFFFFFFF80000000 in the int64 AND below — this tests bits 31..63, not
// just bit 31. Presumably float payloads here have zero upper bits so the
// result is the same; confirm against how the buffer stores float values.
146  auto is_negative =
147  float_argument_input ? [](const int64_t v) -> bool { return (v & (1 << 31)) != 0; }
148  : [](const int64_t v) -> bool { return v < 0; };
149 
// Single pass over this slice: route each non-empty entry into the NULL,
// negative, or positive bucket, keeping index and key buffers in lockstep.
150  for (size_t i = start; i < layout.entry_count; i += step, ++oe_col_buffer_idx) {
151  if (!is_empty_entry<K>(i, groupby_buffer, layout.row_bytes) &&
152  oe_col_buffer[oe_col_buffer_idx] ==
153  null_val_bit_pattern(col_ti, float_argument_input)) {
154  null_idx_buff.push_back(i);
155  continue;
156  }
157  if (is_negative(oe_col_buffer[oe_col_buffer_idx])) { // sign bit works the same for
158  // integer and floating point
159  neg_idx_buff.push_back(i);
160  neg_oe_col_buffer.push_back(oe_col_buffer[oe_col_buffer_idx]);
161  } else {
162  pos_idx_buff.push_back(i);
163  pos_oe_col_buffer.push_back(oe_col_buffer[oe_col_buffer_idx]);
164  }
165  }
// Sort the positive bucket with the caller's requested direction; on GPU the
// buffers are first copied to device memory via get_device_copy_ptr().
166  std::vector<uint32_t> pos_result;
167  ThrustAllocator thrust_allocator(data_mgr, device_id);
168  if (device_type == ExecutorDeviceType::GPU) {
169  const auto dev_pos_idx_buff = get_device_copy_ptr(pos_idx_buff, thrust_allocator);
170  const auto dev_pos_oe_col_buffer =
171  get_device_copy_ptr(pos_oe_col_buffer, thrust_allocator);
172  pos_result = do_radix_sort<K>(device_type,
173  thrust_allocator,
174  groupby_buffer,
175  dev_pos_oe_col_buffer,
176  dev_pos_oe_col_buffer + pos_oe_col_buffer.size(),
177  dev_pos_idx_buff,
178  pos_idx_buff.size(),
179  oe,
180  layout,
181  top_n);
182  } else {
183  CHECK(device_type == ExecutorDeviceType::CPU);
184  pos_result = do_radix_sort<K>(device_type,
185  thrust_allocator,
186  groupby_buffer,
187  pos_oe_col_buffer.begin(),
188  pos_oe_col_buffer.end(),
189  pos_idx_buff.begin(),
190  pos_idx_buff.size(),
191  oe,
192  layout,
193  top_n);
194  }
// The negative bucket is sorted with the direction flipped: as raw bit
// patterns, more-negative floats compare as larger integers, so reversing
// the comparison restores numeric order within this bucket.
195  std::vector<uint32_t> neg_result;
196  PodOrderEntry reverse_oe{oe.tle_no, !oe.is_desc, oe.nulls_first};
197  if (device_type == ExecutorDeviceType::GPU) {
198  const auto dev_neg_idx_buff = get_device_copy_ptr(neg_idx_buff, thrust_allocator);
199  const auto dev_neg_oe_col_buffer =
200  get_device_copy_ptr(neg_oe_col_buffer, thrust_allocator);
201  neg_result = do_radix_sort<K>(device_type,
202  thrust_allocator,
203  groupby_buffer,
204  dev_neg_oe_col_buffer,
205  dev_neg_oe_col_buffer + neg_oe_col_buffer.size(),
206  dev_neg_idx_buff,
207  neg_idx_buff.size(),
208  reverse_oe,
209  layout,
210  top_n);
211  } else {
212  CHECK(device_type == ExecutorDeviceType::CPU);
213  neg_result = do_radix_sort<K>(device_type,
214  thrust_allocator,
215  groupby_buffer,
216  neg_oe_col_buffer.begin(),
217  neg_oe_col_buffer.end(),
218  neg_idx_buff.begin(),
219  neg_idx_buff.size(),
220  reverse_oe,
221  layout,
222  top_n);
223  }
// Concatenate buckets in the requested direction (descending: positives
// first), then splice the NULL indices per oe.nulls_first.
224  if (oe.is_desc) {
225  pos_result.insert(pos_result.end(), neg_result.begin(), neg_result.end());
226  add_nulls(pos_result, null_idx_buff, oe);
227  return pos_result;
228  }
229  neg_result.insert(neg_result.end(), pos_result.begin(), pos_result.end());
230  add_nulls(neg_result, null_idx_buff, oe);
231  return neg_result;
232 }
thrust::device_ptr< T > get_device_copy_ptr(const thrust::host_vector< T > &host_vec, ThrustAllocator &thrust_allocator)
bool takes_float_argument(const TargetInfo &target_info)
Definition: TargetInfo.h:120
int64_t null_val_bit_pattern(const SQLTypeInfo &ti, const bool float_argument_input)
bool nulls_first
CHECK(cgen_state)
SQLTypeInfoCore< ArrayContextTypeSizer, ExecutorTypePackaging, DateTimeFacilities > SQLTypeInfo
Definition: sqltypes.h:852
SQLAgg agg_kind
Definition: TargetInfo.h:41
void add_nulls(std::vector< uint32_t > &idx_buff, const std::vector< uint32_t > &null_idx_buff, const PodOrderEntry &oe)
int tle_no
bool is_desc
const TargetInfo oe_target_info
Definition: sqldefs.h:71

+ Here is the call graph for this function:

template<class K >
std::vector<uint32_t> anonymous_namespace{ResultSetSortImpl.cu}::baseline_sort_int ( const ExecutorDeviceType  device_type,
const int  device_id,
Data_Namespace::DataMgr data_mgr,
const int8_t *  groupby_buffer,
const thrust::host_vector< int64_t > &  oe_col_buffer,
const PodOrderEntry oe,
const GroupByBufferLayoutInfo layout,
const size_t  top_n,
const size_t  start,
const size_t  step 
)

Definition at line 235 of file ResultSetSortImpl.cu.

References add_nulls(), CHECK(), CPU, GroupByBufferLayoutInfo::entry_count, get_compact_type(), get_device_copy_ptr(), GPU, null_val_bit_pattern(), GroupByBufferLayoutInfo::oe_target_info, and GroupByBufferLayoutInfo::row_bytes.

// baseline_sort_int: top-n sort on an integer order-by column over a
// row-wise (baseline) group-by buffer. Unlike the floating-point variant,
// two's-complement values sort correctly as plain int64, so only NULL and
// non-NULL buckets are needed; NULLs are spliced back in via add_nulls().
244  {
245  const auto& entry_ti = get_compact_type(layout.oe_target_info);
246  std::vector<uint32_t> null_idx_buff;
247  thrust::host_vector<uint32_t> notnull_idx_buff;
// Upper bound on entries in this (start, step) slice: ceil(entry_count / step).
248  const auto slice_entry_count =
249  layout.entry_count / step + (layout.entry_count % step ? 1 : 0);
250  null_idx_buff.reserve(slice_entry_count);
251  notnull_idx_buff.reserve(slice_entry_count);
252  thrust::host_vector<int64_t> notnull_oe_col_buffer;
253  notnull_oe_col_buffer.reserve(slice_entry_count);
254  size_t oe_col_buffer_idx = 0;
// Single pass over this slice: non-empty entries whose value matches the
// type's NULL bit pattern go to the NULL bucket; everything else is kept
// with its key for sorting.
255  for (size_t i = start; i < layout.entry_count; i += step, ++oe_col_buffer_idx) {
256  if (!is_empty_entry<K>(i, groupby_buffer, layout.row_bytes) &&
257  oe_col_buffer[oe_col_buffer_idx] == null_val_bit_pattern(entry_ti, false)) {
258  null_idx_buff.push_back(i);
259  } else {
260  notnull_idx_buff.push_back(i);
261  notnull_oe_col_buffer.push_back(oe_col_buffer[oe_col_buffer_idx]);
262  }
263  }
264  std::vector<uint32_t> notnull_result;
265  ThrustAllocator thrust_allocator(data_mgr, device_id);
// GPU path copies both buffers to device memory first; CPU path sorts the
// host vectors in place via iterators.
266  if (device_type == ExecutorDeviceType::GPU) {
267  const auto dev_notnull_idx_buff =
268  get_device_copy_ptr(notnull_idx_buff, thrust_allocator);
269  const auto dev_notnull_oe_col_buffer =
270  get_device_copy_ptr(notnull_oe_col_buffer, thrust_allocator);
271  notnull_result =
272  do_radix_sort<K>(device_type,
273  thrust_allocator,
274  groupby_buffer,
275  dev_notnull_oe_col_buffer,
276  dev_notnull_oe_col_buffer + notnull_oe_col_buffer.size(),
277  dev_notnull_idx_buff,
278  notnull_idx_buff.size(),
279  oe,
280  layout,
281  top_n);
282  } else {
283  CHECK(device_type == ExecutorDeviceType::CPU);
284  notnull_result = do_radix_sort<K>(device_type,
285  thrust_allocator,
286  groupby_buffer,
287  notnull_oe_col_buffer.begin(),
288  notnull_oe_col_buffer.end(),
289  notnull_idx_buff.begin(),
290  notnull_idx_buff.size(),
291  oe,
292  layout,
293  top_n);
294  }
// Prepend or append the NULL indices per oe.nulls_first.
295  add_nulls(notnull_result, null_idx_buff, oe);
296  return notnull_result;
297 }
thrust::device_ptr< T > get_device_copy_ptr(const thrust::host_vector< T > &host_vec, ThrustAllocator &thrust_allocator)
int64_t null_val_bit_pattern(const SQLTypeInfo &ti, const bool float_argument_input)
const SQLTypeInfo get_compact_type(const TargetInfo &target)
CHECK(cgen_state)
void add_nulls(std::vector< uint32_t > &idx_buff, const std::vector< uint32_t > &null_idx_buff, const PodOrderEntry &oe)
const TargetInfo oe_target_info

+ Here is the call graph for this function:

template<class K >
thrust::host_vector<int64_t> anonymous_namespace{ResultSetSortImpl.cu}::collect_order_entry_column ( const int8_t *  groupby_buffer,
const GroupByBufferLayoutInfo layout,
const size_t  start,
const size_t  step 
)

Definition at line 300 of file ResultSetSortImpl.cu.

References GroupByBufferLayoutInfo::col_off, GroupByBufferLayoutInfo::row_bytes, and GroupByBufferLayoutInfo::target_groupby_index.

// collect_order_entry_column: walk the group-by buffer with stride `step`
// starting at row `start`, reading the order-by column of each row into a
// host vector of raw int64 values. For AVG, the (sum, count) pair stored in
// two adjacent slots is collapsed into a double whose bit pattern is stored.
304  {
305  thrust::host_vector<int64_t> oe_col_buffer;
306  const auto row_ptr = groupby_buffer + start * layout.row_bytes;
// If the order-by target is itself a group-by key (target_groupby_index >= 0),
// read it from the key area (sized in units of K); otherwise use the
// precomputed column offset within the row.
307  auto crt_group_ptr1 = layout.target_groupby_index >= 0
308  ? row_ptr + layout.target_groupby_index * sizeof(K)
309  : row_ptr + layout.col_off;
// Second pointer is only set for AVG: its count lives in the next slot.
310  const int8_t* crt_group_ptr2{nullptr};
311  if (layout.oe_target_info.agg_kind == kAVG) {
312  crt_group_ptr2 = crt_group_ptr1 + layout.col_bytes;
313  }
314  const auto entry_ti = get_compact_type(layout.oe_target_info);
315  const bool float_argument_input = takes_float_argument(layout.oe_target_info);
// Advance both pointers by a whole interleaved stride (step rows) per iteration.
316  const auto step_bytes = layout.row_bytes * step;
317  for (size_t i = start; i < layout.entry_count; i += step) {
// col_bytes == 0 presumably means the column has no compact width recorded
// and the key slot width sizeof(K) applies — confirm against the layout code.
318  auto val1 = read_int_from_buff(crt_group_ptr1,
319  layout.col_bytes > 0 ? layout.col_bytes : sizeof(K));
320  if (crt_group_ptr2) {
321  const auto val2 = read_int_from_buff(crt_group_ptr2, 8);
// Collapse AVG's (sum, count) into a double and store its raw bit pattern,
// so downstream sorting can treat the column uniformly as int64.
322  const auto avg_val = pair_to_double({val1, val2}, entry_ti, float_argument_input);
323  val1 = *reinterpret_cast<const int64_t*>(&avg_val);
324  }
325  oe_col_buffer.push_back(val1);
326  crt_group_ptr1 += step_bytes;
327  if (crt_group_ptr2) {
328  crt_group_ptr2 += step_bytes;
329  }
330  }
331  return oe_col_buffer;
332 }
const ssize_t target_groupby_index
int64_t read_int_from_buff(const int8_t *ptr, const int8_t compact_sz)
double pair_to_double(const std::pair< int64_t, int64_t > &fp_pair, const SQLTypeInfo &ti, const bool float_argument_input)
bool takes_float_argument(const TargetInfo &target_info)
Definition: TargetInfo.h:120
const SQLTypeInfo get_compact_type(const TargetInfo &target)
SQLAgg agg_kind
Definition: TargetInfo.h:41
const TargetInfo oe_target_info
Definition: sqldefs.h:71
template<class K , class V , class I >
std::vector<uint32_t> anonymous_namespace{ResultSetSortImpl.cu}::do_radix_sort ( const ExecutorDeviceType  device_type,
ThrustAllocator thrust_allocator,
const int8_t *  groupby_buffer,
dev_oe_col_buffer_begin,
dev_oe_col_buffer_end,
dev_idx_buff_begin,
const size_t  dev_idx_buff_size,
const PodOrderEntry oe,
const GroupByBufferLayoutInfo layout,
const size_t  top_n 
)

Definition at line 20 of file ResultSetSortImpl.cu.

References GPU, PodOrderEntry::is_desc, run_benchmark_import::result, and GroupByBufferLayoutInfo::row_bytes.

// do_radix_sort: sort the index buffer by its order-by keys (ascending by
// default, descending via greater<int64_t>) on CPU or GPU, then materialize
// at most top_n non-empty entry indices on the host. Iterator template
// parameters V/I let the same code run on host iterators or device_ptrs.
29  {
30  if (dev_idx_buff_size == 0) {
31  return {};
32  }
// Direction dispatch: thrust::sort_by_key with greater<> for DESC, default
// less<> for ASC; the GPU path threads the custom allocator through the
// thrust::device execution policy.
33  if (oe.is_desc) {
34  if (device_type == ExecutorDeviceType::GPU) {
35  thrust::sort_by_key(thrust::device(thrust_allocator),
36  dev_oe_col_buffer_begin,
37  dev_oe_col_buffer_end,
38  dev_idx_buff_begin,
39  thrust::greater<int64_t>());
40  } else {
41  thrust::sort_by_key(dev_oe_col_buffer_begin,
42  dev_oe_col_buffer_end,
43  dev_idx_buff_begin,
44  thrust::greater<int64_t>());
45  }
46  } else {
47  if (device_type == ExecutorDeviceType::GPU) {
48  thrust::sort_by_key(thrust::device(thrust_allocator),
49  dev_oe_col_buffer_begin,
50  dev_oe_col_buffer_end,
51  dev_idx_buff_begin);
52  } else {
53  thrust::sort_by_key(
54  dev_oe_col_buffer_begin, dev_oe_col_buffer_end, dev_idx_buff_begin);
55  }
56  }
57  // Speculatively transfer only the top_n first, most of the time it'll be enough.
58  thrust::host_vector<uint32_t> host_vector_result(
59  dev_idx_buff_begin, dev_idx_buff_begin + std::min(top_n, dev_idx_buff_size));
60  // Sometimes, radix sort can bring to the front entries which are empty.
61  // For example, ascending sort on COUNT(*) will bring non-existent groups
62  // to the front of dev_idx_buff since they're 0 in our system. Re-do the
63  // transfer in that case to bring the entire dev_idx_buff; existing logic
64  // in row iteration will take care of skipping the empty rows.
65  for (size_t i = 0; i < host_vector_result.size(); ++i) {
66  const auto entry_idx = host_vector_result[i];
67  if (is_empty_entry<K>(entry_idx, groupby_buffer, layout.row_bytes)) {
68  host_vector_result = thrust::host_vector<uint32_t>(
69  dev_idx_buff_begin, dev_idx_buff_begin + dev_idx_buff_size);
70  break;
71  }
72  }
// Compact the host copy: skip empty entries and stop once top_n indices
// have been collected.
73  std::vector<uint32_t> result;
74  result.reserve(std::min(top_n, host_vector_result.size()));
75  for (size_t i = 0; i < host_vector_result.size(); ++i) {
76  const auto entry_idx = host_vector_result[i];
77  if (!is_empty_entry<K>(entry_idx, groupby_buffer, layout.row_bytes)) {
78  result.push_back(entry_idx);
79  if (result.size() >= top_n) {
80  break;
81  }
82  }
83  }
84  return result;
85 }
bool is_desc
template<typename T >
thrust::device_ptr<T> anonymous_namespace{ResultSetSortImpl.cu}::get_device_copy_ptr ( const thrust::host_vector< T > &  host_vec,
ThrustAllocator thrust_allocator 
)

Definition at line 98 of file ResultSetSortImpl.cu.

References align_to_int64(), ThrustAllocator::allocateScopedBuffer(), copy_to_gpu(), ThrustAllocator::getDataMgr(), and ThrustAllocator::getDeviceId().

Referenced by baseline_sort(), baseline_sort_fp(), and baseline_sort_int().

// get_device_copy_ptr: allocate a device buffer via the ThrustAllocator and
// copy host_vec into it, returning a thrust::device_ptr to the copy. An
// empty host vector yields a null device_ptr and no allocation.
// NOTE(review): the buffer comes from allocateScopedBuffer — presumably its
// lifetime is tied to the allocator's scope, so the returned pointer must
// not outlive the ThrustAllocator; confirm in ThrustAllocator's docs.
99  {
100  if (host_vec.empty()) {
101  return thrust::device_ptr<T>(static_cast<T*>(nullptr));
102  }
103  const auto host_vec_bytes = host_vec.size() * sizeof(T);
// Allocation size is rounded up to an 8-byte multiple for alignment.
104  T* dev_ptr = reinterpret_cast<T*>(
105  thrust_allocator.allocateScopedBuffer(align_to_int64(host_vec_bytes)));
106  copy_to_gpu(thrust_allocator.getDataMgr(),
107  reinterpret_cast<CUdeviceptr>(dev_ptr),
108  &host_vec[0],
109  host_vec_bytes,
110  thrust_allocator.getDeviceId());
111  return thrust::device_ptr<T>(dev_ptr);
112 }
int getDeviceId() const
unsigned long long CUdeviceptr
Definition: nocuda.h:27
Data_Namespace::DataMgr * getDataMgr() const
int8_t * allocateScopedBuffer(std::ptrdiff_t num_bytes)
void copy_to_gpu(Data_Namespace::DataMgr *data_mgr, CUdeviceptr dst, const void *src, const size_t num_bytes, const int device_id)
Definition: GpuMemUtils.cpp:31
FORCE_INLINE HOST DEVICE T align_to_int64(T addr)

+ Here is the call graph for this function:

+ Here is the caller graph for this function: