OmniSciDB  72c90bc290
TopKSort.cu File Reference
#include "BufferEntryUtils.h"
#include "GpuMemUtils.h"
#include "ResultSetBufferAccessors.h"
#include "SortUtils.cuh"
#include "StreamingTopN.h"
#include "TopKSort.h"
#include <thrust/copy.h>
#include <thrust/execution_policy.h>
#include <thrust/functional.h>
#include <thrust/partition.h>
#include <thrust/sort.h>
#include <cuda.h>
#include <iostream>

Classes

struct  is_taken_entry< K, I >
 
struct  is_null_order_entry< K, I >
 
struct  KeyFetcher< K, I >
 
struct  KeyReseter< K >
 
struct  RowFetcher< I >
 

Macros

#define checkCudaErrors(err)   CHECK_EQ(err, CUDA_SUCCESS)
 

Functions

CUstream getQueryEngineCudaStreamForDevice (int device_num)
 
template<typename ForwardIterator >
ForwardIterator partition_by_null (ForwardIterator first, ForwardIterator last, const int64_t null_val, const bool nulls_first, const int8_t *rows_ptr, const GroupByBufferLayoutInfo &layout)
 
template<class K , class I >
void collect_order_entry_column (thrust::device_ptr< K > &d_oe_col_buffer, const int8_t *d_src_buffer, const thrust::device_ptr< I > &d_idx_first, const size_t idx_count, const size_t oe_offset, const size_t oe_stride, ThrustAllocator &allocator, const int device_id)
 
template<class K , class I >
void sort_indices_by_key (thrust::device_ptr< I > d_idx_first, const size_t idx_count, const thrust::device_ptr< K > &d_key_buffer, const bool desc, ThrustAllocator &allocator, const int device_id)
 
template<class I = int32_t>
void do_radix_sort (thrust::device_ptr< I > d_idx_first, const size_t idx_count, const int8_t *d_src_buffer, const PodOrderEntry &oe, const GroupByBufferLayoutInfo &layout, ThrustAllocator &allocator, const int device_id)
 
template<typename DerivedPolicy >
void reset_keys_in_row_buffer (const thrust::detail::execution_policy_base< DerivedPolicy > &exec, int8_t *row_buffer, const size_t key_width, const size_t row_size, const size_t first, const size_t last)
 
std::vector< int8_t > pop_n_rows_from_merged_heaps_gpu (Data_Namespace::DataMgr *data_mgr, const int64_t *dev_heaps, const size_t heaps_size, const size_t n, const PodOrderEntry &oe, const GroupByBufferLayoutInfo &layout, const size_t group_key_bytes, const size_t thread_count, const int device_id)
 

Macro Definition Documentation

#define checkCudaErrors(err)   CHECK_EQ(err, CUDA_SUCCESS)

Definition at line 33 of file TopKSort.cu.

Function Documentation

template<class K , class I >
void collect_order_entry_column ( thrust::device_ptr< K > &  d_oe_col_buffer,
const int8_t *  d_src_buffer,
const thrust::device_ptr< I > &  d_idx_first,
const size_t  idx_count,
const size_t  oe_offset,
const size_t  oe_stride,
ThrustAllocator &  allocator,
const int  device_id 
)

Definition at line 137 of file TopKSort.cu.

References checkCudaErrors, and getQueryEngineCudaStreamForDevice().

Referenced by do_radix_sort().

{
  auto qe_cuda_stream = getQueryEngineCudaStreamForDevice(device_id);
  thrust::for_each(thrust::cuda::par(allocator).on(qe_cuda_stream),
                   thrust::make_counting_iterator(size_t(0)),
                   thrust::make_counting_iterator(idx_count),
                   KeyFetcher<K, I>(thrust::raw_pointer_cast(d_oe_col_buffer),
                                    d_src_buffer + oe_offset,
                                    oe_stride,
                                    thrust::raw_pointer_cast(d_idx_first)));
  checkCudaErrors(cuStreamSynchronize(qe_cuda_stream));
}

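The body above is a gather: for each selected row index, the order-by value is read from the strided row buffer at oe_offset and written into a dense key buffer. Below is a minimal standalone sketch of the same pattern using plain Thrust containers instead of ThrustAllocator/get_device_ptr; the KeyGather functor and gather_keys() helper are illustrative stand-ins for KeyFetcher and collect_order_entry_column, not the engine's API.

// Illustrative sketch only: gather strided order-by values into a dense key buffer.
#include <thrust/device_vector.h>
#include <thrust/execution_policy.h>
#include <thrust/for_each.h>
#include <thrust/iterator/counting_iterator.h>
#include <cstddef>
#include <cstdint>
#include <cstring>

template <class K, class I>
struct KeyGather {
  K* out;             // dense key buffer being filled
  const int8_t* src;  // row buffer, already offset to the order-by column
  size_t stride;      // row size in bytes
  const I* idx;       // selected row indices
  __host__ __device__ void operator()(const size_t i) const {
    K v;
    memcpy(&v, src + static_cast<size_t>(idx[i]) * stride, sizeof(K));
    out[i] = v;
  }
};

// d_rows is a device pointer to the row buffer; col_off/row_bytes describe the layout.
void gather_keys(thrust::device_vector<int64_t>& keys,
                 const int8_t* d_rows,
                 const thrust::device_vector<int32_t>& d_idx,
                 size_t col_off,
                 size_t row_bytes) {
  thrust::for_each(thrust::device,
                   thrust::make_counting_iterator(size_t(0)),
                   thrust::make_counting_iterator(d_idx.size()),
                   KeyGather<int64_t, int32_t>{thrust::raw_pointer_cast(keys.data()),
                                               d_rows + col_off,
                                               row_bytes,
                                               thrust::raw_pointer_cast(d_idx.data())});
}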

template<class I = int32_t>
void do_radix_sort ( thrust::device_ptr< I >  d_idx_first,
const size_t  idx_count,
const int8_t *  d_src_buffer,
const PodOrderEntry &  oe,
const GroupByBufferLayoutInfo &  layout,
ThrustAllocator &  allocator,
const int  device_id 
)

Definition at line 180 of file TopKSort.cu.

References CHECK, GroupByBufferLayoutInfo::col_bytes, GroupByBufferLayoutInfo::col_off, collect_order_entry_column(), PodOrderEntry::is_desc, GroupByBufferLayoutInfo::oe_target_info, GroupByBufferLayoutInfo::row_bytes, sort_indices_by_key(), and TargetInfo::sql_type.

Referenced by pop_n_rows_from_merged_heaps_gpu().

{
  const auto& oe_type = layout.oe_target_info.sql_type;
  if (oe_type.is_fp()) {
    switch (layout.col_bytes) {
      case 4: {
        auto d_oe_buffer = get_device_ptr<float>(idx_count, allocator);
        collect_order_entry_column(d_oe_buffer,
                                   d_src_buffer,
                                   d_idx_first,
                                   idx_count,
                                   layout.col_off,
                                   layout.row_bytes,
                                   allocator,
                                   device_id);
        sort_indices_by_key(
            d_idx_first, idx_count, d_oe_buffer, oe.is_desc, allocator, device_id);
        break;
      }
      case 8: {
        auto d_oe_buffer = get_device_ptr<double>(idx_count, allocator);
        collect_order_entry_column(d_oe_buffer,
                                   d_src_buffer,
                                   d_idx_first,
                                   idx_count,
                                   layout.col_off,
                                   layout.row_bytes,
                                   allocator,
                                   device_id);
        sort_indices_by_key(
            d_idx_first, idx_count, d_oe_buffer, oe.is_desc, allocator, device_id);
        break;
      }
      default:
        CHECK(false);
    }
    return;
  }
  CHECK(oe_type.is_number() || oe_type.is_time());
  switch (layout.col_bytes) {
    case 4: {
      auto d_oe_buffer = get_device_ptr<int32_t>(idx_count, allocator);
      collect_order_entry_column(d_oe_buffer,
                                 d_src_buffer,
                                 d_idx_first,
                                 idx_count,
                                 layout.col_off,
                                 layout.row_bytes,
                                 allocator,
                                 device_id);
      sort_indices_by_key(
          d_idx_first, idx_count, d_oe_buffer, oe.is_desc, allocator, device_id);
      break;
    }
    case 8: {
      auto d_oe_buffer = get_device_ptr<int64_t>(idx_count, allocator);
      collect_order_entry_column(d_oe_buffer,
                                 d_src_buffer,
                                 d_idx_first,
                                 idx_count,
                                 layout.col_off,
                                 layout.row_bytes,
                                 allocator,
                                 device_id);
      sort_indices_by_key(
          d_idx_first, idx_count, d_oe_buffer, oe.is_desc, allocator, device_id);
      break;
    }
    default:
      CHECK(false);
  }
}

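do_radix_sort() dispatches on the order-by column's SQL type and physical width, materializes a key buffer of the matching C++ type, and then co-sorts the index vector with the keys. Below is a hedged sketch of that width dispatch on plain Thrust vectors; sort_indices_for() and sort_by_column_width() are hypothetical names, and the floating-point branches are omitted for brevity.

// Illustrative sketch: choose the key type from the column width, then co-sort indices.
#include <thrust/device_vector.h>
#include <thrust/execution_policy.h>
#include <thrust/functional.h>
#include <thrust/sort.h>
#include <cstddef>
#include <cstdint>

template <class K>
void sort_indices_for(thrust::device_vector<int32_t>& idx,
                      thrust::device_vector<K>& keys,
                      bool desc) {
  if (desc) {
    thrust::sort_by_key(
        thrust::device, keys.begin(), keys.end(), idx.begin(), thrust::greater<K>());
  } else {
    thrust::sort_by_key(thrust::device, keys.begin(), keys.end(), idx.begin());
  }
}

// Runtime dispatch on the physical width of the order-by column, as in do_radix_sort():
// 4-byte vs. 8-byte integer keys (the fp branches are analogous).
void sort_by_column_width(thrust::device_vector<int32_t>& idx,
                          thrust::device_vector<int32_t>& keys32,
                          thrust::device_vector<int64_t>& keys64,
                          size_t col_bytes,
                          bool desc) {
  switch (col_bytes) {
    case 4:
      sort_indices_for(idx, keys32, desc);
      break;
    case 8:
      sort_indices_for(idx, keys64, desc);
      break;
    default:
      break;  // unsupported width; the real code CHECKs here
  }
}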

CUstream getQueryEngineCudaStreamForDevice ( int  device_num)

Definition at line 7 of file QueryEngine.cpp.

References QueryEngine::getInstance().

Referenced by RangeJoinHashTable::approximateTupleCount(), BoundingBoxIntersectJoinHashTable::approximateTupleCount(), BaselineJoinHashTable::approximateTupleCount(), collect_order_entry_column(), anonymous_namespace{BoundingBoxIntersectJoinHashTable.cpp}::compute_bucket_sizes(), copy_projection_buffer_from_gpu_columnar(), copy_to_nvidia_gpu(), BaselineJoinHashTable::copyCpuHashTableToGpu(), PerfectJoinHashTable::copyCpuHashTableToGpu(), QueryMemoryInitializer::copyFromTableFunctionGpuBuffers(), anonymous_namespace{ResultSetSortImpl.cu}::do_radix_sort(), TableFunctionExecutionContext::execute(), anonymous_namespace{ResultSetIteration.cpp}::fetch_data_from_gpu(), ResultSet::getVarlenOrderEntry(), BaselineJoinHashTable::initHashTableForDevice(), BaselineJoinHashTableBuilder::initHashTableOnGpu(), InValuesBitmap::InValuesBitmap(), TableFunctionExecutionContext::launchGpuCode(), ResultSet::makeVarlenTargetValue(), pop_n_rows_from_merged_heaps_gpu(), QueryExecutionContext::QueryExecutionContext(), ResultSet::radixSortOnGpu(), PerfectJoinHashTable::reify(), RangeJoinHashTable::reifyWithLayout(), BoundingBoxIntersectJoinHashTable::reifyWithLayout(), BaselineJoinHashTable::reifyWithLayout(), ExecutionKernel::runImpl(), sort_indices_by_key(), ResultSet::syncEstimatorBuffer(), PerfectJoinHashTable::toSet(), BaselineJoinHashTable::toSet(), BoundingBoxIntersectJoinHashTable::toSet(), PerfectJoinHashTable::toString(), BaselineJoinHashTable::toString(), and BoundingBoxIntersectJoinHashTable::toString().

{  // NOTE: CUstream is cudaStream_t
  return QueryEngine::getInstance()->getCudaStreamForDevice(device_num);
}


template<typename ForwardIterator >
ForwardIterator partition_by_null ( ForwardIterator  first,
ForwardIterator  last,
const int64_t  null_val,
const bool  nulls_first,
const int8_t *  rows_ptr,
const GroupByBufferLayoutInfo &  layout
)

Definition at line 71 of file TopKSort.cu.

References GroupByBufferLayoutInfo::col_bytes, GroupByBufferLayoutInfo::col_off, and GroupByBufferLayoutInfo::row_bytes.

Referenced by pop_n_rows_from_merged_heaps_gpu().

{
  if (nulls_first) {
    return (layout.col_bytes == 4)
               ? thrust::partition(
                     first,
                     last,
                     is_null_order_entry<int32_t>(
                         rows_ptr + layout.col_off, layout.row_bytes, null_val))
               : thrust::partition(
                     first,
                     last,
                     is_null_order_entry<int64_t>(
                         rows_ptr + layout.col_off, layout.row_bytes, null_val));
  } else {
    return (layout.col_bytes == 4)
               ? thrust::partition(
                     first,
                     last,
                     thrust::not1(is_null_order_entry<int32_t>(
                         rows_ptr + layout.col_off, layout.row_bytes, null_val)))
               : thrust::partition(
                     first,
                     last,
                     thrust::not1(is_null_order_entry<int64_t>(
                         rows_ptr + layout.col_off, layout.row_bytes, null_val)));
  }
}

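partition_by_null() uses thrust::partition with an is_null_order_entry predicate so that NULL keys end up contiguously at the front (nulls_first) or at the back; the non-NULL range can then be radix sorted on its own. Below is a standalone sketch of the idea on a plain key vector; is_null, is_not_null, and group_nulls() are illustrative, and the real code partitions an index vector over the strided row buffer rather than the keys themselves.

// Illustrative sketch: move NULL-sentinel keys to one end of the range.
#include <thrust/device_vector.h>
#include <thrust/execution_policy.h>
#include <thrust/partition.h>
#include <cstddef>
#include <cstdint>

struct is_null {
  int64_t null_val;
  __host__ __device__ bool operator()(const int64_t v) const { return v == null_val; }
};

struct is_not_null {
  int64_t null_val;
  __host__ __device__ bool operator()(const int64_t v) const { return v != null_val; }
};

// Returns the number of NULL keys after moving them to the requested end.
size_t group_nulls(thrust::device_vector<int64_t>& keys,
                   int64_t null_val,
                   bool nulls_first) {
  auto sep = nulls_first
                 ? thrust::partition(
                       thrust::device, keys.begin(), keys.end(), is_null{null_val})
                 : thrust::partition(
                       thrust::device, keys.begin(), keys.end(), is_not_null{null_val});
  return nulls_first ? static_cast<size_t>(sep - keys.begin())
                     : static_cast<size_t>(keys.end() - sep);
}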

std::vector<int8_t> pop_n_rows_from_merged_heaps_gpu ( Data_Namespace::DataMgr *  data_mgr,
const int64_t *  dev_heaps,
const size_t  heaps_size,
const size_t  n,
const PodOrderEntry &  oe,
const GroupByBufferLayoutInfo &  layout,
const size_t  group_key_bytes,
const size_t  thread_count,
const int  device_id 
)

Definition at line 303 of file TopKSort.cu.

References CHECK_EQ, checkCudaErrors, GroupByBufferLayoutInfo::col_bytes, gpu_enabled::copy(), do_radix_sort(), streaming_top_n::get_heap_size(), streaming_top_n::get_rows_offset_of_heaps(), getQueryEngineCudaStreamForDevice(), null_val_bit_pattern(), PodOrderEntry::nulls_first, GroupByBufferLayoutInfo::oe_target_info, partition_by_null(), reset_keys_in_row_buffer(), GroupByBufferLayoutInfo::row_bytes, and TargetInfo::sql_type.

{
  const auto row_size = layout.row_bytes;
  CHECK_EQ(heaps_size, streaming_top_n::get_heap_size(row_size, n, thread_count));
  const int8_t* rows_ptr = reinterpret_cast<const int8_t*>(dev_heaps) +
                           streaming_top_n::get_rows_offset_of_heaps(n, thread_count);
  const auto total_entry_count = n * thread_count;
  ThrustAllocator thrust_allocator(data_mgr, device_id);
  auto d_indices = get_device_ptr<int32_t>(total_entry_count, thrust_allocator);
  auto qe_cuda_stream = getQueryEngineCudaStreamForDevice(device_id);
  thrust::sequence(thrust::cuda::par(thrust_allocator).on(qe_cuda_stream),
                   d_indices,
                   d_indices + total_entry_count);
  checkCudaErrors(cuStreamSynchronize(qe_cuda_stream));
  auto separator =
      (group_key_bytes == 4)
          ? thrust::partition(thrust::cuda::par(thrust_allocator).on(qe_cuda_stream),
                              d_indices,
                              d_indices + total_entry_count,
                              is_taken_entry<int32_t>(rows_ptr, row_size))
          : thrust::partition(thrust::cuda::par(thrust_allocator).on(qe_cuda_stream),
                              d_indices,
                              d_indices + total_entry_count,
                              is_taken_entry<int64_t>(rows_ptr, row_size));
  checkCudaErrors(cuStreamSynchronize(qe_cuda_stream));
  const size_t actual_entry_count = separator - d_indices;
  if (!actual_entry_count) {
    std::vector<int8_t> top_rows(row_size * n);
    reset_keys_in_row_buffer(
        thrust::host, &top_rows[0], layout.col_bytes, row_size, 0, n);
    return top_rows;
  }

  const auto& oe_type = layout.oe_target_info.sql_type;
  if (oe_type.get_notnull()) {
    do_radix_sort(
        d_indices, actual_entry_count, rows_ptr, oe, layout, thrust_allocator, device_id);
  } else {
    auto separator = partition_by_null(d_indices,
                                       d_indices + actual_entry_count,
                                       null_val_bit_pattern(oe_type, false),
                                       oe.nulls_first,
                                       rows_ptr,
                                       layout);
    if (oe.nulls_first) {
      const size_t null_count = separator - d_indices;
      if (null_count < actual_entry_count) {
        do_radix_sort(separator,
                      actual_entry_count - null_count,
                      rows_ptr,
                      oe,
                      layout,
                      thrust_allocator,
                      device_id);
      }
    } else {
      const size_t nonnull_count = separator - d_indices;
      if (nonnull_count > 0) {
        do_radix_sort(
            d_indices, nonnull_count, rows_ptr, oe, layout, thrust_allocator, device_id);
      }
    }
  }

  const auto final_entry_count = std::min(n, actual_entry_count);
  auto d_top_rows = get_device_ptr<int8_t>(row_size * n, thrust_allocator);
  thrust::for_each(thrust::cuda::par(thrust_allocator).on(qe_cuda_stream),
                   thrust::make_counting_iterator(size_t(0)),
                   thrust::make_counting_iterator(final_entry_count),
                   RowFetcher<int32_t>(thrust::raw_pointer_cast(d_top_rows),
                                       rows_ptr,
                                       thrust::raw_pointer_cast(d_indices),
                                       row_size));
  checkCudaErrors(cuStreamSynchronize(qe_cuda_stream));

  if (final_entry_count < n) {
    reset_keys_in_row_buffer(thrust::cuda::par(thrust_allocator).on(qe_cuda_stream),
                             thrust::raw_pointer_cast(d_top_rows),
                             layout.col_bytes,
                             row_size,
                             final_entry_count,
                             n);
    checkCudaErrors(cuStreamSynchronize(qe_cuda_stream));
  }

  std::vector<int8_t> top_rows(row_size * n);
  thrust::copy(d_top_rows, d_top_rows + row_size * n, top_rows.begin());
  return top_rows;
}

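The last stage of pop_n_rows_from_merged_heaps_gpu() compacts the winning rows: a RowFetcher functor copies each row named by the sorted index vector into a dense n-row device buffer, which is then copied back to the host. Below is a simplified sketch of that gather-and-copy step; RowCopy and fetch_top_rows() are illustrative stand-ins, and the sketch omits the key-reset step that the real function applies to any unused trailing slots.

// Illustrative sketch: compact the selected rows and copy them back to the host.
#include <thrust/copy.h>
#include <thrust/device_vector.h>
#include <thrust/execution_policy.h>
#include <thrust/for_each.h>
#include <thrust/iterator/counting_iterator.h>
#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <cstring>
#include <vector>

struct RowCopy {
  int8_t* dst;         // dense output buffer
  const int8_t* src;   // source row buffer (the merged heaps' row section)
  const int32_t* idx;  // sorted row indices, best rows first
  size_t row_size;     // bytes per row
  __host__ __device__ void operator()(const size_t i) const {
    memcpy(dst + i * row_size, src + static_cast<size_t>(idx[i]) * row_size, row_size);
  }
};

std::vector<int8_t> fetch_top_rows(const int8_t* d_rows,
                                   const thrust::device_vector<int32_t>& d_idx,
                                   size_t row_size,
                                   size_t n) {
  const size_t count = std::min(n, d_idx.size());
  thrust::device_vector<int8_t> d_out(row_size * n);
  thrust::for_each(thrust::device,
                   thrust::make_counting_iterator(size_t(0)),
                   thrust::make_counting_iterator(count),
                   RowCopy{thrust::raw_pointer_cast(d_out.data()),
                           d_rows,
                           thrust::raw_pointer_cast(d_idx.data()),
                           row_size});
  // Slots [count, n) would still need their keys reset to the empty sentinel.
  std::vector<int8_t> out(row_size * n);
  thrust::copy(d_out.begin(), d_out.end(), out.begin());
  return out;
}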

template<typename DerivedPolicy >
void reset_keys_in_row_buffer ( const thrust::detail::execution_policy_base< DerivedPolicy > &  exec,
int8_t *  row_buffer,
const size_t  key_width,
const size_t  row_size,
const size_t  first,
const size_t  last 
)

Definition at line 276 of file TopKSort.cu.

References CHECK, EMPTY_KEY_32, and EMPTY_KEY_64.

Referenced by pop_n_rows_from_merged_heaps_gpu().

{
  switch (key_width) {
    case 4:
      thrust::for_each(
          exec,
          thrust::make_counting_iterator(first),
          thrust::make_counting_iterator(last),
          KeyReseter<int32_t>(row_buffer, row_size, static_cast<int32_t>(EMPTY_KEY_32)));
      break;
    case 8:
      thrust::for_each(
          exec,
          thrust::make_counting_iterator(first),
          thrust::make_counting_iterator(last),
          KeyReseter<int64_t>(row_buffer, row_size, static_cast<int64_t>(EMPTY_KEY_64)));
      break;
    default:
      CHECK(false);
  }
}

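reset_keys_in_row_buffer() stamps an empty-key sentinel (EMPTY_KEY_32 or EMPTY_KEY_64, depending on the key width) over the group key of every unused row slot so that readers of the buffer skip those rows. Below is a minimal sketch of that stamping pattern; KeyStamp, stamp_empty_keys(), and the -1 placeholder sentinel are assumptions, not the engine's definitions.

// Illustrative sketch: write a sentinel key at the start of rows [first, last).
#include <thrust/execution_policy.h>
#include <thrust/for_each.h>
#include <thrust/iterator/counting_iterator.h>
#include <cstddef>
#include <cstdint>
#include <cstring>

template <class K>
struct KeyStamp {
  int8_t* rows;     // row buffer (host or device, depending on the execution policy)
  size_t row_size;  // bytes per row
  K empty_key;      // sentinel written over the group key
  __host__ __device__ void operator()(const size_t i) const {
    memcpy(rows + i * row_size, &empty_key, sizeof(K));
  }
};

// With thrust::host this can run on a plain heap buffer, mirroring the thrust::host
// call in pop_n_rows_from_merged_heaps_gpu(). -1 stands in for EMPTY_KEY_64 here.
inline void stamp_empty_keys(int8_t* rows, size_t row_size, size_t first, size_t last) {
  thrust::for_each(thrust::host,
                   thrust::make_counting_iterator(first),
                   thrust::make_counting_iterator(last),
                   KeyStamp<int64_t>{rows, row_size, static_cast<int64_t>(-1)});
}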

template<class K , class I >
void sort_indices_by_key ( thrust::device_ptr< I >  d_idx_first,
const size_t  idx_count,
const thrust::device_ptr< K > &  d_key_buffer,
const bool  desc,
ThrustAllocator &  allocator,
const int  device_id 
)

Definition at line 157 of file TopKSort.cu.

References checkCudaErrors, and getQueryEngineCudaStreamForDevice().

Referenced by do_radix_sort().

{
  auto qe_cuda_stream = getQueryEngineCudaStreamForDevice(device_id);
  if (desc) {
    thrust::sort_by_key(thrust::cuda::par(allocator).on(qe_cuda_stream),
                        d_key_buffer,
                        d_key_buffer + idx_count,
                        d_idx_first,
                        thrust::greater<K>());
  } else {
    thrust::sort_by_key(thrust::cuda::par(allocator).on(qe_cuda_stream),
                        d_key_buffer,
                        d_key_buffer + idx_count,
                        d_idx_first);
  }
  checkCudaErrors(cuStreamSynchronize(qe_cuda_stream));
}

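sort_indices_by_key() is a thin wrapper over thrust::sort_by_key: the keys drive the comparison and the index vector is permuted alongside them, with thrust::greater<K> selecting descending order. Below is a small self-contained usage example; the sample data and the sort_top_indices() wrapper are illustrative only.

// Illustrative sketch: co-sort an index vector with its key vector.
#include <thrust/device_vector.h>
#include <thrust/execution_policy.h>
#include <thrust/functional.h>
#include <thrust/sequence.h>
#include <thrust/sort.h>
#include <cstdint>
#include <vector>

void sort_top_indices(thrust::device_vector<int64_t>& keys,
                      thrust::device_vector<int32_t>& idx,
                      bool desc) {
  if (desc) {
    thrust::sort_by_key(thrust::device,
                        keys.begin(),
                        keys.end(),
                        idx.begin(),
                        thrust::greater<int64_t>());
  } else {
    thrust::sort_by_key(thrust::device, keys.begin(), keys.end(), idx.begin());
  }
}

int main() {
  thrust::device_vector<int64_t> keys(std::vector<int64_t>{30, 10, 20});
  thrust::device_vector<int32_t> idx(3);
  thrust::sequence(idx.begin(), idx.end());    // idx = {0, 1, 2}
  sort_top_indices(keys, idx, /*desc=*/true);  // keys -> {30, 20, 10}, idx -> {0, 2, 1}
  return 0;
}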