OmniSciDB  c1a53651b2
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
TopKSort.cu File Reference
#include "BufferEntryUtils.h"
#include "GpuMemUtils.h"
#include "ResultSetBufferAccessors.h"
#include "SortUtils.cuh"
#include "StreamingTopN.h"
#include "TopKSort.h"
#include <thrust/copy.h>
#include <thrust/execution_policy.h>
#include <thrust/functional.h>
#include <thrust/partition.h>
#include <thrust/sort.h>
#include <cuda.h>
#include <iostream>
+ Include dependency graph for TopKSort.cu:

Go to the source code of this file.

Classes

struct  is_taken_entry< K, I >
 
struct  is_null_order_entry< K, I >
 
struct  KeyFetcher< K, I >
 
struct  KeyReseter< K >
 
struct  RowFetcher< I >
 

Macros

#define checkCudaErrors(err)   CHECK_EQ(err, CUDA_SUCCESS)
 

Functions

CUstream getQueryEngineCudaStreamForDevice (int device_num)
 
template<typename ForwardIterator >
ForwardIterator partition_by_null (ForwardIterator first, ForwardIterator last, const int64_t null_val, const bool nulls_first, const int8_t *rows_ptr, const GroupByBufferLayoutInfo &layout)
 
template<class K , class I >
void collect_order_entry_column (thrust::device_ptr< K > &d_oe_col_buffer, const int8_t *d_src_buffer, const thrust::device_ptr< I > &d_idx_first, const size_t idx_count, const size_t oe_offset, const size_t oe_stride, ThrustAllocator &allocator, const int device_id)
 
template<class K , class I >
void sort_indices_by_key (thrust::device_ptr< I > d_idx_first, const size_t idx_count, const thrust::device_ptr< K > &d_key_buffer, const bool desc, ThrustAllocator &allocator, const int device_id)
 
template<class I = int32_t>
void do_radix_sort (thrust::device_ptr< I > d_idx_first, const size_t idx_count, const int8_t *d_src_buffer, const PodOrderEntry &oe, const GroupByBufferLayoutInfo &layout, ThrustAllocator &allocator, const int device_id)
 
template<typename DerivedPolicy >
void reset_keys_in_row_buffer (const thrust::detail::execution_policy_base< DerivedPolicy > &exec, int8_t *row_buffer, const size_t key_width, const size_t row_size, const size_t first, const size_t last)
 
std::vector< int8_t > pop_n_rows_from_merged_heaps_gpu (Data_Namespace::DataMgr *data_mgr, const int64_t *dev_heaps, const size_t heaps_size, const size_t n, const PodOrderEntry &oe, const GroupByBufferLayoutInfo &layout, const size_t group_key_bytes, const size_t thread_count, const int device_id)
 

Macro Definition Documentation

#define checkCudaErrors (   err)    CHECK_EQ(err, CUDA_SUCCESS)

Definition at line 39 of file TopKSort.cu.

Function Documentation

template<class K , class I >
void collect_order_entry_column ( thrust::device_ptr< K > &  d_oe_col_buffer,
const int8_t *  d_src_buffer,
const thrust::device_ptr< I > &  d_idx_first,
const size_t  idx_count,
const size_t  oe_offset,
const size_t  oe_stride,
ThrustAllocator allocator,
const int  device_id 
)

Definition at line 143 of file TopKSort.cu.

References checkCudaErrors, and getQueryEngineCudaStreamForDevice().

Referenced by do_radix_sort().

150  {
151  auto qe_cuda_stream = getQueryEngineCudaStreamForDevice(device_id);
152  thrust::for_each(thrust::cuda::par(allocator).on(qe_cuda_stream),
153  thrust::make_counting_iterator(size_t(0)),
154  thrust::make_counting_iterator(idx_count),
155  KeyFetcher<K, I>(thrust::raw_pointer_cast(d_oe_col_buffer),
156  d_src_buffer + oe_offset,
157  oe_stride,
158  thrust::raw_pointer_cast(d_idx_first)));
159  checkCudaErrors(cuStreamSynchronize(qe_cuda_stream));
160 }
CUstream getQueryEngineCudaStreamForDevice(int device_num)
Definition: QueryEngine.cpp:7
#define checkCudaErrors(err)
Definition: GpuInitGroups.cu:9

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

template<class I = int32_t>
void do_radix_sort ( thrust::device_ptr< I >  d_idx_first,
const size_t  idx_count,
const int8_t *  d_src_buffer,
const PodOrderEntry oe,
const GroupByBufferLayoutInfo layout,
ThrustAllocator allocator,
const int  device_id 
)

Definition at line 186 of file TopKSort.cu.

References CHECK, GroupByBufferLayoutInfo::col_bytes, GroupByBufferLayoutInfo::col_off, collect_order_entry_column(), PodOrderEntry::is_desc, GroupByBufferLayoutInfo::oe_target_info, GroupByBufferLayoutInfo::row_bytes, sort_indices_by_key(), and TargetInfo::sql_type.

Referenced by pop_n_rows_from_merged_heaps_gpu().

192  {
193  const auto& oe_type = layout.oe_target_info.sql_type;
194  if (oe_type.is_fp()) {
195  switch (layout.col_bytes) {
196  case 4: {
197  auto d_oe_buffer = get_device_ptr<float>(idx_count, allocator);
198  collect_order_entry_column(d_oe_buffer,
199  d_src_buffer,
200  d_idx_first,
201  idx_count,
202  layout.col_off,
203  layout.row_bytes,
204  allocator,
205  device_id);
207  d_idx_first, idx_count, d_oe_buffer, oe.is_desc, allocator, device_id);
208  break;
209  }
210  case 8: {
211  auto d_oe_buffer = get_device_ptr<double>(idx_count, allocator);
212  collect_order_entry_column(d_oe_buffer,
213  d_src_buffer,
214  d_idx_first,
215  idx_count,
216  layout.col_off,
217  layout.row_bytes,
218  allocator,
219  device_id);
221  d_idx_first, idx_count, d_oe_buffer, oe.is_desc, allocator, device_id);
222  break;
223  }
224  default:
225  CHECK(false);
226  }
227  return;
228  }
229  CHECK(oe_type.is_number() || oe_type.is_time());
230  switch (layout.col_bytes) {
231  case 4: {
232  auto d_oe_buffer = get_device_ptr<int32_t>(idx_count, allocator);
233  collect_order_entry_column(d_oe_buffer,
234  d_src_buffer,
235  d_idx_first,
236  idx_count,
237  layout.col_off,
238  layout.row_bytes,
239  allocator,
240  device_id);
242  d_idx_first, idx_count, d_oe_buffer, oe.is_desc, allocator, device_id);
243  break;
244  }
245  case 8: {
246  auto d_oe_buffer = get_device_ptr<int64_t>(idx_count, allocator);
247  collect_order_entry_column(d_oe_buffer,
248  d_src_buffer,
249  d_idx_first,
250  idx_count,
251  layout.col_off,
252  layout.row_bytes,
253  allocator,
254  device_id);
256  d_idx_first, idx_count, d_oe_buffer, oe.is_desc, allocator, device_id);
257  break;
258  }
259  default:
260  CHECK(false);
261  }
262 }
SQLTypeInfo sql_type
Definition: TargetInfo.h:52
void collect_order_entry_column(thrust::device_ptr< K > &d_oe_col_buffer, const int8_t *d_src_buffer, const thrust::device_ptr< I > &d_idx_first, const size_t idx_count, const size_t oe_offset, const size_t oe_stride, ThrustAllocator &allocator, const int device_id)
Definition: TopKSort.cu:143
bool is_desc
const TargetInfo oe_target_info
#define CHECK(condition)
Definition: Logger.h:291
void sort_indices_by_key(thrust::device_ptr< I > d_idx_first, const size_t idx_count, const thrust::device_ptr< K > &d_key_buffer, const bool desc, ThrustAllocator &allocator, const int device_id)
Definition: TopKSort.cu:163

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

CUstream getQueryEngineCudaStreamForDevice ( int  device_num)

Definition at line 7 of file QueryEngine.cpp.

References QueryEngine::getInstance().

Referenced by RangeJoinHashTable::approximateTupleCount(), OverlapsJoinHashTable::approximateTupleCount(), BaselineJoinHashTable::approximateTupleCount(), collect_order_entry_column(), anonymous_namespace{OverlapsJoinHashTable.cpp}::compute_bucket_sizes(), copy_projection_buffer_from_gpu_columnar(), copy_to_nvidia_gpu(), BaselineJoinHashTable::copyCpuHashTableToGpu(), PerfectJoinHashTable::copyCpuHashTableToGpu(), QueryMemoryInitializer::copyFromTableFunctionGpuBuffers(), anonymous_namespace{ResultSetSortImpl.cu}::do_radix_sort(), TableFunctionExecutionContext::execute(), anonymous_namespace{ResultSetIteration.cpp}::fetch_data_from_gpu(), ResultSet::getVarlenOrderEntry(), BaselineJoinHashTable::initHashTableForDevice(), BaselineJoinHashTableBuilder::initHashTableOnGpu(), InValuesBitmap::InValuesBitmap(), TableFunctionExecutionContext::launchGpuCode(), ResultSet::makeVarlenTargetValue(), pop_n_rows_from_merged_heaps_gpu(), QueryExecutionContext::QueryExecutionContext(), ResultSet::radixSortOnGpu(), PerfectJoinHashTable::reify(), RangeJoinHashTable::reifyWithLayout(), OverlapsJoinHashTable::reifyWithLayout(), BaselineJoinHashTable::reifyWithLayout(), ExecutionKernel::runImpl(), sort_indices_by_key(), ResultSet::syncEstimatorBuffer(), PerfectJoinHashTable::toSet(), BaselineJoinHashTable::toSet(), OverlapsJoinHashTable::toSet(), PerfectJoinHashTable::toString(), BaselineJoinHashTable::toString(), and OverlapsJoinHashTable::toString().

8  { // NOTE: CUstream is cudaStream_t
9  return QueryEngine::getInstance()->getCudaStreamForDevice(device_num);
10 }
static std::shared_ptr< QueryEngine > getInstance()
Definition: QueryEngine.h:81

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

template<typename ForwardIterator >
ForwardIterator partition_by_null ( ForwardIterator  first,
ForwardIterator  last,
const int64_t  null_val,
const bool  nulls_first,
const int8_t *  rows_ptr,
const GroupByBufferLayoutInfo layout 
)

Definition at line 77 of file TopKSort.cu.

References GroupByBufferLayoutInfo::col_bytes, GroupByBufferLayoutInfo::col_off, and GroupByBufferLayoutInfo::row_bytes.

Referenced by pop_n_rows_from_merged_heaps_gpu().

82  {
83  if (nulls_first) {
84  return (layout.col_bytes == 4)
85  ? thrust::partition(
86  first,
87  last,
89  rows_ptr + layout.col_off, layout.row_bytes, null_val))
90  : thrust::partition(
91  first,
92  last,
93  is_null_order_entry<int64_t>(
94  rows_ptr + layout.col_off, layout.row_bytes, null_val));
95  } else {
96  return (layout.col_bytes == 4)
97  ? thrust::partition(
98  first,
99  last,
100  thrust::not1(is_null_order_entry<int32_t>(
101  rows_ptr + layout.col_off, layout.row_bytes, null_val)))
102  : thrust::partition(
103  first,
104  last,
105  thrust::not1(is_null_order_entry<int64_t>(
106  rows_ptr + layout.col_off, layout.row_bytes, null_val)));
107  }
108 }
Definition: TopKSort.cu:55

+ Here is the caller graph for this function:

std::vector<int8_t> pop_n_rows_from_merged_heaps_gpu ( Data_Namespace::DataMgr data_mgr,
const int64_t *  dev_heaps,
const size_t  heaps_size,
const size_t  n,
const PodOrderEntry oe,
const GroupByBufferLayoutInfo layout,
const size_t  group_key_bytes,
const size_t  thread_count,
const int  device_id 
)

Definition at line 309 of file TopKSort.cu.

References CHECK_EQ, checkCudaErrors, GroupByBufferLayoutInfo::col_bytes, gpu_enabled::copy(), do_radix_sort(), streaming_top_n::get_heap_size(), streaming_top_n::get_rows_offset_of_heaps(), getQueryEngineCudaStreamForDevice(), anonymous_namespace{Utm.h}::n, null_val_bit_pattern(), PodOrderEntry::nulls_first, GroupByBufferLayoutInfo::oe_target_info, partition_by_null(), reset_keys_in_row_buffer(), GroupByBufferLayoutInfo::row_bytes, generate_TableFunctionsFactory_init::separator, and TargetInfo::sql_type.

318  {
319  const auto row_size = layout.row_bytes;
320  CHECK_EQ(heaps_size, streaming_top_n::get_heap_size(row_size, n, thread_count));
321  const int8_t* rows_ptr = reinterpret_cast<const int8_t*>(dev_heaps) +
323  const auto total_entry_count = n * thread_count;
324  ThrustAllocator thrust_allocator(data_mgr, device_id);
325  auto d_indices = get_device_ptr<int32_t>(total_entry_count, thrust_allocator);
326  auto qe_cuda_stream = getQueryEngineCudaStreamForDevice(device_id);
327  thrust::sequence(thrust::cuda::par(thrust_allocator).on(qe_cuda_stream),
328  d_indices,
329  d_indices + total_entry_count);
330  checkCudaErrors(cuStreamSynchronize(qe_cuda_stream));
331  auto separator =
332  (group_key_bytes == 4)
333  ? thrust::partition(thrust::cuda::par(thrust_allocator).on(qe_cuda_stream),
334  d_indices,
335  d_indices + total_entry_count,
336  is_taken_entry<int32_t>(rows_ptr, row_size))
337  : thrust::partition(thrust::cuda::par(thrust_allocator).on(qe_cuda_stream),
338  d_indices,
339  d_indices + total_entry_count,
340  is_taken_entry<int64_t>(rows_ptr, row_size));
341  checkCudaErrors(cuStreamSynchronize(qe_cuda_stream));
342  const size_t actual_entry_count = separator - d_indices;
343  if (!actual_entry_count) {
344  std::vector<int8_t> top_rows(row_size * n);
346  thrust::host, &top_rows[0], layout.col_bytes, row_size, 0, n);
347  return top_rows;
348  }
349 
350  const auto& oe_type = layout.oe_target_info.sql_type;
351  if (oe_type.get_notnull()) {
353  d_indices, actual_entry_count, rows_ptr, oe, layout, thrust_allocator, device_id);
354  } else {
355  auto separator = partition_by_null(d_indices,
356  d_indices + actual_entry_count,
357  null_val_bit_pattern(oe_type, false),
358  oe.nulls_first,
359  rows_ptr,
360  layout);
361  if (oe.nulls_first) {
362  const size_t null_count = separator - d_indices;
363  if (null_count < actual_entry_count) {
365  actual_entry_count - null_count,
366  rows_ptr,
367  oe,
368  layout,
369  thrust_allocator,
370  device_id);
371  }
372  } else {
373  const size_t nonnull_count = separator - d_indices;
374  if (nonnull_count > 0) {
376  d_indices, nonnull_count, rows_ptr, oe, layout, thrust_allocator, device_id);
377  }
378  }
379  }
380 
381  const auto final_entry_count = std::min(n, actual_entry_count);
382  auto d_top_rows = get_device_ptr<int8_t>(row_size * n, thrust_allocator);
383  thrust::for_each(thrust::cuda::par(thrust_allocator).on(qe_cuda_stream),
384  thrust::make_counting_iterator(size_t(0)),
385  thrust::make_counting_iterator(final_entry_count),
386  RowFetcher<int32_t>(thrust::raw_pointer_cast(d_top_rows),
387  rows_ptr,
388  thrust::raw_pointer_cast(d_indices),
389  row_size));
390  checkCudaErrors(cuStreamSynchronize(qe_cuda_stream));
391 
392  if (final_entry_count < n) {
393  reset_keys_in_row_buffer(thrust::cuda::par(thrust_allocator).on(qe_cuda_stream),
394  thrust::raw_pointer_cast(d_top_rows),
395  layout.col_bytes,
396  row_size,
397  final_entry_count,
398  n);
399  checkCudaErrors(cuStreamSynchronize(qe_cuda_stream));
400  }
401 
402  std::vector<int8_t> top_rows(row_size * n);
403  thrust::copy(d_top_rows, d_top_rows + row_size * n, top_rows.begin());
404  return top_rows;
405 }
#define CHECK_EQ(x, y)
Definition: Logger.h:301
void reset_keys_in_row_buffer(const thrust::detail::execution_policy_base< DerivedPolicy > &exec, int8_t *row_buffer, const size_t key_width, const size_t row_size, const size_t first, const size_t last)
Definition: TopKSort.cu:282
SQLTypeInfo sql_type
Definition: TargetInfo.h:52
size_t get_rows_offset_of_heaps(const size_t n, const size_t thread_count)
Definition: TopKSort.cu:44
int64_t null_val_bit_pattern(const SQLTypeInfo &ti, const bool float_argument_input)
bool nulls_first
DEVICE auto copy(ARGS &&...args)
Definition: gpu_enabled.h:51
void do_radix_sort(thrust::device_ptr< I > d_idx_first, const size_t idx_count, const int8_t *d_src_buffer, const PodOrderEntry &oe, const GroupByBufferLayoutInfo &layout, ThrustAllocator &allocator, const int device_id)
Definition: TopKSort.cu:186
CUstream getQueryEngineCudaStreamForDevice(int device_num)
Definition: QueryEngine.cpp:7
size_t get_heap_size(const size_t row_size, const size_t n, const size_t thread_count)
const TargetInfo oe_target_info
#define checkCudaErrors(err)
Definition: GpuInitGroups.cu:9
constexpr double n
Definition: Utm.h:38
ForwardIterator partition_by_null(ForwardIterator first, ForwardIterator last, const int64_t null_val, const bool nulls_first, const int8_t *rows_ptr, const GroupByBufferLayoutInfo &layout)
Definition: TopKSort.cu:77

+ Here is the call graph for this function:

template<typename DerivedPolicy >
void reset_keys_in_row_buffer ( const thrust::detail::execution_policy_base< DerivedPolicy > &  exec,
int8_t *  row_buffer,
const size_t  key_width,
const size_t  row_size,
const size_t  first,
const size_t  last 
)

Definition at line 282 of file TopKSort.cu.

References CHECK, EMPTY_KEY_32, and EMPTY_KEY_64.

Referenced by pop_n_rows_from_merged_heaps_gpu().

288  {
289  switch (key_width) {
290  case 4:
291  thrust::for_each(
292  exec,
293  thrust::make_counting_iterator(first),
294  thrust::make_counting_iterator(last),
295  KeyReseter<int32_t>(row_buffer, row_size, static_cast<int32_t>(EMPTY_KEY_32)));
296  break;
297  case 8:
298  thrust::for_each(
299  exec,
300  thrust::make_counting_iterator(first),
301  thrust::make_counting_iterator(last),
302  KeyReseter<int64_t>(row_buffer, row_size, static_cast<int64_t>(EMPTY_KEY_64)));
303  break;
304  default:
305  CHECK(false);
306  }
307 }
#define EMPTY_KEY_64
#define CHECK(condition)
Definition: Logger.h:291
#define EMPTY_KEY_32

+ Here is the caller graph for this function:

template<class K , class I >
void sort_indices_by_key ( thrust::device_ptr< I >  d_idx_first,
const size_t  idx_count,
const thrust::device_ptr< K > &  d_key_buffer,
const bool  desc,
ThrustAllocator allocator,
const int  device_id 
)

Definition at line 163 of file TopKSort.cu.

References checkCudaErrors, and getQueryEngineCudaStreamForDevice().

Referenced by do_radix_sort().

168  {
169  auto qe_cuda_stream = getQueryEngineCudaStreamForDevice(device_id);
170  if (desc) {
171  thrust::sort_by_key(thrust::cuda::par(allocator).on(qe_cuda_stream),
172  d_key_buffer,
173  d_key_buffer + idx_count,
174  d_idx_first,
175  thrust::greater<K>());
176  } else {
177  thrust::sort_by_key(thrust::cuda::par(allocator).on(qe_cuda_stream),
178  d_key_buffer,
179  d_key_buffer + idx_count,
180  d_idx_first);
181  }
182  checkCudaErrors(cuStreamSynchronize(qe_cuda_stream));
183 }
CUstream getQueryEngineCudaStreamForDevice(int device_num)
Definition: QueryEngine.cpp:7
#define checkCudaErrors(err)
Definition: GpuInitGroups.cu:9

+ Here is the call graph for this function:

+ Here is the caller graph for this function: