OmniSciDB  72c90bc290
ResultSetSortImpl.cu File Reference
#include <cuda.h>
#include "BufferCompaction.h"
#include "GpuMemUtils.h"
#include "GpuRtConstants.h"
#include "ResultSetBufferAccessors.h"
#include "ResultSetSortImpl.h"
#include "SortUtils.cuh"
#include <thrust/copy.h>
#include <thrust/execution_policy.h>
#include <thrust/host_vector.h>
#include <thrust/sort.h>
#include "BufferEntryUtils.h"

Namespaces

 anonymous_namespace{ResultSetSortImpl.cu}
 

Macros

#define checkCudaErrors(err)   CHECK_EQ(err, CUDA_SUCCESS)
 
#define FORCE_CPU_VERSION
 

Functions

CUstream getQueryEngineCudaStreamForDevice (int device_num)
 
template<class K , class V , class I >
std::vector< uint32_t > anonymous_namespace{ResultSetSortImpl.cu}::do_radix_sort (const ExecutorDeviceType device_type, const int device_id, ThrustAllocator &thrust_allocator, const int8_t *groupby_buffer, V dev_oe_col_buffer_begin, V dev_oe_col_buffer_end, I dev_idx_buff_begin, const size_t dev_idx_buff_size, const PodOrderEntry &oe, const GroupByBufferLayoutInfo &layout, const size_t top_n)
 
void anonymous_namespace{ResultSetSortImpl.cu}::add_nulls (std::vector< uint32_t > &idx_buff, const std::vector< uint32_t > &null_idx_buff, const PodOrderEntry &oe)
 
template<typename T >
thrust::device_ptr< T > anonymous_namespace{ResultSetSortImpl.cu}::get_device_copy_ptr (const thrust::host_vector< T > &host_vec, ThrustAllocator &thrust_allocator)
 
template<class K >
std::vector< uint32_t > anonymous_namespace{ResultSetSortImpl.cu}::baseline_sort_fp (const ExecutorDeviceType device_type, const int device_id, Data_Namespace::DataMgr *data_mgr, const int8_t *groupby_buffer, const thrust::host_vector< int64_t > &oe_col_buffer, const PodOrderEntry &oe, const GroupByBufferLayoutInfo &layout, const size_t top_n, const size_t start, const size_t step)
 
template<class K >
std::vector< uint32_t > anonymous_namespace{ResultSetSortImpl.cu}::baseline_sort_int (const ExecutorDeviceType device_type, const int device_id, Data_Namespace::DataMgr *data_mgr, const int8_t *groupby_buffer, const thrust::host_vector< int64_t > &oe_col_buffer, const PodOrderEntry &oe, const GroupByBufferLayoutInfo &layout, const size_t top_n, const size_t start, const size_t step)
 
template<class K >
thrust::host_vector< int64_t > anonymous_namespace{ResultSetSortImpl.cu}::collect_order_entry_column (const int8_t *groupby_buffer, const GroupByBufferLayoutInfo &layout, const size_t start, const size_t step)
 
template<class K >
std::vector< uint32_t > baseline_sort (const ExecutorDeviceType device_type, const int device_id, Data_Namespace::DataMgr *data_mgr, const int8_t *groupby_buffer, const PodOrderEntry &oe, const GroupByBufferLayoutInfo &layout, const size_t top_n, const size_t start, const size_t step)
 
template std::vector< uint32_t > baseline_sort< int32_t > (const ExecutorDeviceType device_type, const int device_id, Data_Namespace::DataMgr *data_mgr, const int8_t *groupby_buffer, const PodOrderEntry &oe, const GroupByBufferLayoutInfo &layout, const size_t top_n, const size_t start, const size_t step)
 
template std::vector< uint32_t > baseline_sort< int64_t > (const ExecutorDeviceType device_type, const int device_id, Data_Namespace::DataMgr *data_mgr, const int8_t *groupby_buffer, const PodOrderEntry &oe, const GroupByBufferLayoutInfo &layout, const size_t top_n, const size_t start, const size_t step)
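
The helpers above follow a sort-by-key pattern: build a permutation index buffer, sort it by the order-by column, and keep the first top_n entries. A minimal, self-contained sketch of that pattern using Thrust follows; it is not the library's actual do_radix_sort, which additionally handles null separation, descending order, and the group-by buffer layout, and the function name here is hypothetical.

#include <thrust/copy.h>
#include <thrust/device_vector.h>
#include <thrust/sequence.h>
#include <thrust/sort.h>
#include <algorithm>
#include <cstdint>
#include <vector>

// Sketch only: sort a permutation index buffer by an order-by column and
// return the first top_n indices (ascending order).
std::vector<uint32_t> sort_indices_by_column(const thrust::device_vector<int64_t>& col,
                                             const size_t top_n) {
  thrust::device_vector<uint32_t> idx(col.size());
  thrust::sequence(idx.begin(), idx.end());    // 0, 1, 2, ...
  thrust::device_vector<int64_t> keys = col;   // copy: keys are sorted in place
  thrust::sort_by_key(keys.begin(), keys.end(), idx.begin());
  std::vector<uint32_t> result(std::min(top_n, idx.size()));
  thrust::copy(idx.begin(), idx.begin() + result.size(), result.begin());
  return result;
}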
 

Macro Definition Documentation

#define checkCudaErrors(err) CHECK_EQ(err, CUDA_SUCCESS)

Definition at line 16 of file ResultSetSortImpl.cu.
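
A hedged usage sketch: any CUDA Driver API call returning CUresult can be guarded with this macro; CHECK_EQ logs and aborts if the status is not CUDA_SUCCESS. The allocation below is illustrative, not taken from this file.

#include <cuda.h>

// Illustrative only: guard driver API calls with the macro.
void alloc_and_free_example() {
  CUdeviceptr dev_ptr;
  checkCudaErrors(cuMemAlloc(&dev_ptr, 1024));  // hypothetical 1 KB allocation
  checkCudaErrors(cuMemFree(dev_ptr));
}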

#define FORCE_CPU_VERSION

Definition at line 18 of file ResultSetSortImpl.cu.

Function Documentation

template<class K>
std::vector<uint32_t> baseline_sort(const ExecutorDeviceType device_type,
                                    const int device_id,
                                    Data_Namespace::DataMgr* data_mgr,
                                    const int8_t* groupby_buffer,
                                    const PodOrderEntry& oe,
                                    const GroupByBufferLayoutInfo& layout,
                                    const size_t top_n,
                                    const size_t start,
                                    const size_t step)

Definition at line 353 of file ResultSetSortImpl.cu.

References CHECK, CPU, get_compact_type(), anonymous_namespace{ResultSetSortImpl.cu}::get_device_copy_ptr(), GPU, PodOrderEntry::is_desc, kAVG, and PodOrderEntry::nulls_first.

{
  auto oe_col_buffer = collect_order_entry_column<K>(groupby_buffer, layout, start, step);
  const auto& entry_ti = get_compact_type(layout.oe_target_info);
  CHECK(entry_ti.is_number());
  if (entry_ti.is_fp() || layout.oe_target_info.agg_kind == kAVG) {
    return baseline_sort_fp<K>(device_type,
                               device_id,
                               data_mgr,
                               groupby_buffer,
                               oe_col_buffer,
                               oe,
                               layout,
                               top_n,
                               start,
                               step);
  }
  // Because of how we represent nulls for integral types, they'd be at the
  // wrong position in these two cases. Separate them into a different buffer.
  if ((oe.is_desc && oe.nulls_first) || (!oe.is_desc && !oe.nulls_first)) {
    return baseline_sort_int<K>(device_type,
                                device_id,
                                data_mgr,
                                groupby_buffer,
                                oe_col_buffer,
                                oe,
                                layout,
                                top_n,
                                start,
                                step);
  }
  ThrustAllocator thrust_allocator(data_mgr, device_id);
  // Fastest path, no need to separate nulls away since they'll end up at the
  // right place as a side effect of how we're representing nulls.
  if (device_type == ExecutorDeviceType::GPU) {
    if (oe_col_buffer.empty()) {
      return {};
    }
    const auto dev_idx_buff =
        get_device_ptr<uint32_t>(oe_col_buffer.size(), thrust_allocator);
    thrust::sequence(dev_idx_buff, dev_idx_buff + oe_col_buffer.size(), start, step);
    const auto dev_oe_col_buffer = get_device_copy_ptr(oe_col_buffer, thrust_allocator);
    return do_radix_sort<K>(device_type,
                            device_id,
                            thrust_allocator,
                            groupby_buffer,
                            dev_oe_col_buffer,
                            dev_oe_col_buffer + oe_col_buffer.size(),
                            dev_idx_buff,
                            oe_col_buffer.size(),
                            oe,
                            layout,
                            top_n);
  }
  CHECK(device_type == ExecutorDeviceType::CPU);
  thrust::host_vector<uint32_t> host_idx_buff(oe_col_buffer.size());
  thrust::sequence(host_idx_buff.begin(), host_idx_buff.end(), start, step);
  return do_radix_sort<K>(device_type,
                          device_id,
                          thrust_allocator,
                          groupby_buffer,
                          oe_col_buffer.begin(),
                          oe_col_buffer.end(),
                          host_idx_buff.begin(),
                          host_idx_buff.size(),
                          oe,
                          layout,
                          top_n);
}
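
The null-handling comments in the body can be made concrete with a small worked example. Assuming (for this sketch only) that integral nulls are encoded as the type's minimum value, an ascending sort naturally places nulls first and a descending sort naturally places them last; the remaining two direction/null-order combinations are the ones routed to baseline_sort_int.

#include <algorithm>
#include <cstdint>
#include <functional>
#include <limits>
#include <vector>

int main() {
  // Assumption for this sketch: the integral null sentinel is the minimum value.
  const int64_t null_sentinel = std::numeric_limits<int64_t>::min();
  std::vector<int64_t> vals{3, null_sentinel, 1, 2};
  std::sort(vals.begin(), vals.end());
  // Ascending: {null, 1, 2, 3} -- nulls land first (!is_desc && nulls_first).
  std::sort(vals.begin(), vals.end(), std::greater<int64_t>());
  // Descending: {3, 2, 1, null} -- nulls land last (is_desc && !nulls_first).
  // The other two combinations would leave nulls misplaced, hence the
  // separate-buffer path in baseline_sort.
}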


template std::vector<uint32_t> baseline_sort<int32_t>(const ExecutorDeviceType device_type,
                                                      const int device_id,
                                                      Data_Namespace::DataMgr* data_mgr,
                                                      const int8_t* groupby_buffer,
                                                      const PodOrderEntry& oe,
                                                      const GroupByBufferLayoutInfo& layout,
                                                      const size_t top_n,
                                                      const size_t start,
                                                      const size_t step)
template std::vector<uint32_t> baseline_sort<int64_t>(const ExecutorDeviceType device_type,
                                                      const int device_id,
                                                      Data_Namespace::DataMgr* data_mgr,
                                                      const int8_t* groupby_buffer,
                                                      const PodOrderEntry& oe,
                                                      const GroupByBufferLayoutInfo& layout,
                                                      const size_t top_n,
                                                      const size_t start,
                                                      const size_t step)
CUstream getQueryEngineCudaStreamForDevice(int device_num)

Definition at line 7 of file QueryEngine.cpp.

{  // NOTE: CUstream is cudaStream_t
  return QueryEngine::getInstance()->getCudaStreamForDevice(device_num);
}
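
A hedged caller sketch: fetch the per-device stream and use it for asynchronous Driver API work. The helper name and its parameters are hypothetical; checkCudaErrors is the macro defined earlier in this file.

#include <cuda.h>

// Hypothetical helper (not in the library): copy a host buffer to the device
// on the engine's per-device stream, then wait for completion.
void copy_to_device_async(const int device_id,
                          CUdeviceptr dev_ptr,
                          const void* host_buf,
                          const size_t nbytes) {
  CUstream stream = getQueryEngineCudaStreamForDevice(device_id);
  checkCudaErrors(cuMemcpyHtoDAsync(dev_ptr, host_buf, nbytes, stream));
  checkCudaErrors(cuStreamSynchronize(stream));
}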