OmniSciDB  72c90bc290
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
TopKSort.cu
Go to the documentation of this file.
1 /*
2  * Copyright 2022 HEAVY.AI, Inc.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #include "BufferEntryUtils.h"
18 #include "GpuMemUtils.h"
20 #include "SortUtils.cuh"
21 #include "StreamingTopN.h"
22 #include "TopKSort.h"
23 
#include <thrust/copy.h>
#include <thrust/execution_policy.h>
#include <thrust/for_each.h>
#include <thrust/functional.h>
#include <thrust/partition.h>
#include <thrust/sequence.h>
#include <thrust/sort.h>
29 
30 #include <cuda.h>
32 
33 #define checkCudaErrors(err) CHECK_EQ(err, CUDA_SUCCESS)
34 
35 #include <iostream>
36 
// Predicate functor: true iff the row-buffer slot at a given index holds a
// real (taken) entry rather than the empty-key sentinel. K is the group-key
// width type (int32_t or int64_t), I the index type.
// NOTE(review): extraction dropped the `struct` opening line; reconstructed
// from the member/ctor definitions.
template <class K, class I = int32_t>
struct is_taken_entry {
  // buff: base of the row-wise group-by buffer; stride: bytes per row.
  is_taken_entry(const int8_t* buff, const size_t stride)
      : buff_ptr(buff), key_stride(stride) {}
  __host__ __device__ bool operator()(const I index) {
    return !is_empty_entry<K>(static_cast<size_t>(index), buff_ptr, key_stride);
  }
  const int8_t* buff_ptr;
  const size_t key_stride;
};
47 
// Predicate functor: true iff the order-by column value at a given row index
// equals the column's NULL sentinel. Comparison is done on the raw bit
// pattern (via reinterpret_cast) so float/double NULL sentinels compare
// correctly as well.
template <class K, class I = int32_t>
struct is_null_order_entry {
  using argument_type = I;
  // base: pointer to the order-by column of row 0; stride: bytes between
  // consecutive rows; nul: NULL sentinel bit pattern, widened to int64_t.
  is_null_order_entry(const int8_t* base, const size_t stride, const int64_t nul)
      : oe_base(base), oe_stride(stride), null_val(nul) {}
  __host__ __device__ bool operator()(const I index) {
    const auto oe_val = *reinterpret_cast<const K*>(oe_base + index * oe_stride);
    switch (sizeof(K)) {
      case 4:
        return *reinterpret_cast<const int32_t*>(&oe_val) ==
               static_cast<int32_t>(null_val);
      case 8:
        return *reinterpret_cast<const int64_t*>(&oe_val) == null_val;
      default:
        // Unsupported key width; treated as non-null.
        return false;
    }
  }
  const int8_t* oe_base;
  const size_t oe_stride;
  const int64_t null_val;
};
69 
// Partitions the index range [first, last) so that indices whose order-by
// value is NULL come first (nulls_first) or last (!nulls_first). Returns the
// partition point: first non-NULL index when nulls_first, else first NULL.
// Dispatches on the order-by column width (4 vs 8 bytes).
// NOTE(review): the two functor-construction lines in the nulls_first branch
// were lost in extraction; reconstructed to mirror the intact else branch.
template <typename ForwardIterator>
ForwardIterator partition_by_null(ForwardIterator first,
                                  ForwardIterator last,
                                  const int64_t null_val,
                                  const bool nulls_first,
                                  const int8_t* rows_ptr,
                                  const GroupByBufferLayoutInfo& layout) {
  if (nulls_first) {
    return (layout.col_bytes == 4)
               ? thrust::partition(
                     first,
                     last,
                     is_null_order_entry<int32_t>(
                         rows_ptr + layout.col_off, layout.row_bytes, null_val))
               : thrust::partition(
                     first,
                     last,
                     is_null_order_entry<int64_t>(
                         rows_ptr + layout.col_off, layout.row_bytes, null_val));
  } else {
    return (layout.col_bytes == 4)
               ? thrust::partition(
                     first,
                     last,
                     thrust::not1(is_null_order_entry<int32_t>(
                         rows_ptr + layout.col_off, layout.row_bytes, null_val)))
               : thrust::partition(
                     first,
                     last,
                     thrust::not1(is_null_order_entry<int64_t>(
                         rows_ptr + layout.col_off, layout.row_bytes, null_val)));
  }
}
103 
// Gather functor: materializes order-by key values into a dense array,
// following an index permutation: key_base[i] = key at row idx_base[i].
// NOTE(review): member `K* key_base;` was lost in extraction; reconstructed
// from its use in operator() and the ctor initializer list.
template <class K, class I>
struct KeyFetcher {
  KeyFetcher(K* out_base,
             const int8_t* src_oe_base,
             const size_t stride,
             const I* indices)
      : key_base(out_base), oe_base(src_oe_base), oe_stride(stride), idx_base(indices) {}
  __host__ __device__ void operator()(const I index) {
    key_base[index] = *reinterpret_cast<const K*>(oe_base + idx_base[index] * oe_stride);
  }

  K* key_base;
  const int8_t* oe_base;
  const size_t oe_stride;
  const I* idx_base;
};
120 
// Functor which stamps the empty-key sentinel into the key slot of one row,
// used to mark trailing, unused rows of an output buffer as vacant.
template <class K>
struct KeyReseter {
  // base: start of the row buffer; stride_bytes: bytes per row;
  // sentinel: empty-key value of width K to write.
  KeyReseter(int8_t* base, const size_t stride_bytes, const K sentinel)
      : rows_base(base), key_stride(stride_bytes), empty_key(sentinel) {}
  __host__ __device__ void operator()(const size_t index) {
    *reinterpret_cast<K*>(rows_base + index * key_stride) = empty_key;
  }

  int8_t* rows_base;
  const size_t key_stride;
  const K empty_key;
};
134 
// TODO(miyu) : switch to shared version in ResultSetSortImpl.cu.
// Gathers the order-by column values for the given index permutation into
// d_oe_col_buffer, reading from d_src_buffer (+oe_offset) on the query
// engine stream of `device_id`, and blocks until the gather completes.
template <class K, class I>
void collect_order_entry_column(thrust::device_ptr<K>& d_oe_col_buffer,
                                const int8_t* d_src_buffer,
                                const thrust::device_ptr<I>& d_idx_first,
                                const size_t idx_count,
                                const size_t oe_offset,
                                const size_t oe_stride,
                                ThrustAllocator& allocator,
                                const int device_id) {
  auto stream = getQueryEngineCudaStreamForDevice(device_id);
  auto exec_policy = thrust::cuda::par(allocator).on(stream);
  KeyFetcher<K, I> fetcher(thrust::raw_pointer_cast(d_oe_col_buffer),
                           d_src_buffer + oe_offset,
                           oe_stride,
                           thrust::raw_pointer_cast(d_idx_first));
  thrust::for_each(exec_policy,
                   thrust::make_counting_iterator(size_t(0)),
                   thrust::make_counting_iterator(idx_count),
                   fetcher);
  checkCudaErrors(cuStreamSynchronize(stream));
}
155 
// Sorts the index permutation in place so the corresponding keys in
// d_key_buffer end up ascending (or descending when `desc`). The key buffer
// itself is permuted by thrust::sort_by_key as a side effect; blocks until
// the device sort on `device_id`'s query engine stream finishes.
template <class K, class I>
void sort_indices_by_key(thrust::device_ptr<I> d_idx_first,
                         const size_t idx_count,
                         const thrust::device_ptr<K>& d_key_buffer,
                         const bool desc,
                         ThrustAllocator& allocator,
                         const int device_id) {
  auto stream = getQueryEngineCudaStreamForDevice(device_id);
  auto exec_policy = thrust::cuda::par(allocator).on(stream);
  const auto keys_end = d_key_buffer + idx_count;
  if (desc) {
    thrust::sort_by_key(
        exec_policy, d_key_buffer, keys_end, d_idx_first, thrust::greater<K>());
  } else {
    thrust::sort_by_key(exec_policy, d_key_buffer, keys_end, d_idx_first);
  }
  checkCudaErrors(cuStreamSynchronize(stream));
}
178 
// Sorts the index permutation [d_idx_first, d_idx_first + idx_count) by the
// order-by column described by `layout`, honoring oe.is_desc. The column is
// first gathered into a typed scratch buffer (float/double for FP columns,
// int32/int64 for numeric/time columns, chosen by layout.col_bytes), then
// sorted with sort_indices_by_key.
// NOTE(review): the four `sort_indices_by_key(` call heads were lost in
// extraction (their argument lines survived); reconstructed accordingly.
template <class I = int32_t>
void do_radix_sort(thrust::device_ptr<I> d_idx_first,
                   const size_t idx_count,
                   const int8_t* d_src_buffer,
                   const PodOrderEntry& oe,
                   const GroupByBufferLayoutInfo& layout,
                   ThrustAllocator& allocator,
                   const int device_id) {
  const auto& oe_type = layout.oe_target_info.sql_type;
  if (oe_type.is_fp()) {
    switch (layout.col_bytes) {
      case 4: {
        auto d_oe_buffer = get_device_ptr<float>(idx_count, allocator);
        collect_order_entry_column(d_oe_buffer,
                                   d_src_buffer,
                                   d_idx_first,
                                   idx_count,
                                   layout.col_off,
                                   layout.row_bytes,
                                   allocator,
                                   device_id);
        sort_indices_by_key(
            d_idx_first, idx_count, d_oe_buffer, oe.is_desc, allocator, device_id);
        break;
      }
      case 8: {
        auto d_oe_buffer = get_device_ptr<double>(idx_count, allocator);
        collect_order_entry_column(d_oe_buffer,
                                   d_src_buffer,
                                   d_idx_first,
                                   idx_count,
                                   layout.col_off,
                                   layout.row_bytes,
                                   allocator,
                                   device_id);
        sort_indices_by_key(
            d_idx_first, idx_count, d_oe_buffer, oe.is_desc, allocator, device_id);
        break;
      }
      default:
        CHECK(false);
    }
    return;
  }
  CHECK(oe_type.is_number() || oe_type.is_time());
  switch (layout.col_bytes) {
    case 4: {
      auto d_oe_buffer = get_device_ptr<int32_t>(idx_count, allocator);
      collect_order_entry_column(d_oe_buffer,
                                 d_src_buffer,
                                 d_idx_first,
                                 idx_count,
                                 layout.col_off,
                                 layout.row_bytes,
                                 allocator,
                                 device_id);
      sort_indices_by_key(
          d_idx_first, idx_count, d_oe_buffer, oe.is_desc, allocator, device_id);
      break;
    }
    case 8: {
      auto d_oe_buffer = get_device_ptr<int64_t>(idx_count, allocator);
      collect_order_entry_column(d_oe_buffer,
                                 d_src_buffer,
                                 d_idx_first,
                                 idx_count,
                                 layout.col_off,
                                 layout.row_bytes,
                                 allocator,
                                 device_id);
      sort_indices_by_key(
          d_idx_first, idx_count, d_oe_buffer, oe.is_desc, allocator, device_id);
      break;
    }
    default:
      CHECK(false);
  }
}
257 
// Copies whole rows (opaque byte blobs of row_size bytes) from src_base into
// dst_base, compacting by an index permutation: destination row i is source
// row idx_base[i].
template <class I>
struct RowFetcher {
  RowFetcher(int8_t* out_base,
             const int8_t* in_base,
             const I* indices,
             const size_t row_sz)
      : dst_base(out_base), src_base(in_base), idx_base(indices), row_size(row_sz) {}
  __host__ __device__ void operator()(const I index) {
    auto dst = dst_base + index * row_size;
    auto src = src_base + idx_base[index] * row_size;
    memcpy(dst, src, row_size);
  }

  int8_t* dst_base;
  const int8_t* src_base;
  const I* idx_base;
  const size_t row_size;
};
274 
// Marks rows [first, last) of row_buffer as empty by writing the EMPTY_KEY
// sentinel of the matching width (key_width must be 4 or 8 bytes) into each
// row's key slot, running under the supplied Thrust execution policy (host
// or device).
// NOTE(review): the function's signature line was lost in extraction;
// reconstructed from the Doxygen cross-reference of this same function.
template <typename DerivedPolicy>
void reset_keys_in_row_buffer(
    const thrust::detail::execution_policy_base<DerivedPolicy>& exec,
    int8_t* row_buffer,
    const size_t key_width,
    const size_t row_size,
    const size_t first,
    const size_t last) {
  switch (key_width) {
    case 4:
      thrust::for_each(
          exec,
          thrust::make_counting_iterator(first),
          thrust::make_counting_iterator(last),
          KeyReseter<int32_t>(row_buffer, row_size, static_cast<int32_t>(EMPTY_KEY_32)));
      break;
    case 8:
      thrust::for_each(
          exec,
          thrust::make_counting_iterator(first),
          thrust::make_counting_iterator(last),
          KeyReseter<int64_t>(row_buffer, row_size, static_cast<int64_t>(EMPTY_KEY_64)));
      break;
    default:
      CHECK(false);
  }
}
302 
// Extracts the top n result rows from the per-thread streaming top-n heaps
// resident on device `device_id` and returns them as a host-side byte
// vector of exactly n rows (row_size bytes each). Steps:
//   1. build an index sequence over all heap slots (n * thread_count),
//   2. partition out the slots actually taken (key != empty sentinel),
//   3. order the taken slots by the order-by column (with NULLs moved
//      first/last per oe.nulls_first when the column is nullable),
//   4. gather the first min(n, taken) rows and pad the remainder with
//      empty-key sentinels before copying back to the host.
// NOTE(review): the signature and five call-head lines (get_rows_offset_of_heaps,
// reset_keys_in_row_buffer, three do_radix_sort calls) were lost in
// extraction; reconstructed from the Doxygen cross-references and the
// surviving argument lines.
std::vector<int8_t> pop_n_rows_from_merged_heaps_gpu(
    Data_Namespace::DataMgr* data_mgr,
    const int64_t* dev_heaps,
    const size_t heaps_size,
    const size_t n,
    const PodOrderEntry& oe,
    const GroupByBufferLayoutInfo& layout,
    const size_t group_key_bytes,
    const size_t thread_count,
    const int device_id) {
  const auto row_size = layout.row_bytes;
  CHECK_EQ(heaps_size, streaming_top_n::get_heap_size(row_size, n, thread_count));
  // Rows live past the per-thread heap headers inside the heaps buffer.
  const int8_t* rows_ptr = reinterpret_cast<const int8_t*>(dev_heaps) +
                           streaming_top_n::get_rows_offset_of_heaps(n, thread_count);
  const auto total_entry_count = n * thread_count;
  ThrustAllocator thrust_allocator(data_mgr, device_id);
  auto d_indices = get_device_ptr<int32_t>(total_entry_count, thrust_allocator);
  auto qe_cuda_stream = getQueryEngineCudaStreamForDevice(device_id);
  thrust::sequence(thrust::cuda::par(thrust_allocator).on(qe_cuda_stream),
                   d_indices,
                   d_indices + total_entry_count);
  checkCudaErrors(cuStreamSynchronize(qe_cuda_stream));
  // Move indices of occupied heap slots to the front.
  auto separator =
      (group_key_bytes == 4)
          ? thrust::partition(thrust::cuda::par(thrust_allocator).on(qe_cuda_stream),
                              d_indices,
                              d_indices + total_entry_count,
                              is_taken_entry<int32_t>(rows_ptr, row_size))
          : thrust::partition(thrust::cuda::par(thrust_allocator).on(qe_cuda_stream),
                              d_indices,
                              d_indices + total_entry_count,
                              is_taken_entry<int64_t>(rows_ptr, row_size));
  checkCudaErrors(cuStreamSynchronize(qe_cuda_stream));
  const size_t actual_entry_count = separator - d_indices;
  if (!actual_entry_count) {
    // No rows at all: return a buffer of n empty-key rows.
    std::vector<int8_t> top_rows(row_size * n);
    reset_keys_in_row_buffer(
        thrust::host, &top_rows[0], layout.col_bytes, row_size, 0, n);
    return top_rows;
  }

  const auto& oe_type = layout.oe_target_info.sql_type;
  if (oe_type.get_notnull()) {
    do_radix_sort(
        d_indices, actual_entry_count, rows_ptr, oe, layout, thrust_allocator, device_id);
  } else {
    // Nullable order-by column: segregate NULL rows, then sort only the
    // non-NULL segment (NULL rows keep their partition position).
    auto separator = partition_by_null(d_indices,
                                       d_indices + actual_entry_count,
                                       null_val_bit_pattern(oe_type, false),
                                       oe.nulls_first,
                                       rows_ptr,
                                       layout);
    if (oe.nulls_first) {
      const size_t null_count = separator - d_indices;
      if (null_count < actual_entry_count) {
        do_radix_sort(separator,
                      actual_entry_count - null_count,
                      rows_ptr,
                      oe,
                      layout,
                      thrust_allocator,
                      device_id);
      }
    } else {
      const size_t nonnull_count = separator - d_indices;
      if (nonnull_count > 0) {
        do_radix_sort(
            d_indices, nonnull_count, rows_ptr, oe, layout, thrust_allocator, device_id);
      }
    }
  }

  const auto final_entry_count = std::min(n, actual_entry_count);
  auto d_top_rows = get_device_ptr<int8_t>(row_size * n, thrust_allocator);
  // Gather the winning rows in sorted order into the output buffer.
  thrust::for_each(thrust::cuda::par(thrust_allocator).on(qe_cuda_stream),
                   thrust::make_counting_iterator(size_t(0)),
                   thrust::make_counting_iterator(final_entry_count),
                   RowFetcher<int32_t>(thrust::raw_pointer_cast(d_top_rows),
                                       rows_ptr,
                                       thrust::raw_pointer_cast(d_indices),
                                       row_size));
  checkCudaErrors(cuStreamSynchronize(qe_cuda_stream));

  if (final_entry_count < n) {
    // Fewer than n rows found: stamp empty-key sentinels on the tail.
    reset_keys_in_row_buffer(thrust::cuda::par(thrust_allocator).on(qe_cuda_stream),
                             thrust::raw_pointer_cast(d_top_rows),
                             layout.col_bytes,
                             row_size,
                             final_entry_count,
                             n);
    checkCudaErrors(cuStreamSynchronize(qe_cuda_stream));
  }

  std::vector<int8_t> top_rows(row_size * n);
  thrust::copy(d_top_rows, d_top_rows + row_size * n, top_rows.begin());
  return top_rows;
}
#define CHECK_EQ(x, y)
Definition: Logger.h:301
void reset_keys_in_row_buffer(const thrust::detail::execution_policy_base< DerivedPolicy > &exec, int8_t *row_buffer, const size_t key_width, const size_t row_size, const size_t first, const size_t last)
Definition: TopKSort.cu:276
#define EMPTY_KEY_64
const I * idx_base
Definition: TopKSort.cu:118
__host__ __device__ void operator()(const size_t index)
Definition: TopKSort.cu:125
int8_t * dst_base
Definition: TopKSort.cu:269
const K empty_key
Definition: TopKSort.cu:132
__host__ __device__ void operator()(const I index)
Definition: TopKSort.cu:111
is_null_order_entry(const int8_t *base, const size_t stride, const int64_t nul)
Definition: TopKSort.cu:51
const size_t row_size
Definition: TopKSort.cu:272
Utility functions for easy access to the result set buffers.
void * CUstream
Definition: nocuda.h:23
SQLTypeInfo sql_type
Definition: TargetInfo.h:52
Streaming Top N algorithm.
const int8_t * oe_base
Definition: TopKSort.cu:116
size_t get_rows_offset_of_heaps(const size_t n, const size_t thread_count)
Definition: TopKSort.cu:38
I argument_type
Definition: TopKSort.cu:50
std::vector< int8_t > pop_n_rows_from_merged_heaps_gpu(Data_Namespace::DataMgr *data_mgr, const int64_t *dev_heaps, const size_t heaps_size, const size_t n, const PodOrderEntry &oe, const GroupByBufferLayoutInfo &layout, const size_t group_key_bytes, const size_t thread_count, const int device_id)
Definition: TopKSort.cu:303
int64_t null_val_bit_pattern(const SQLTypeInfo &ti, const bool float_argument_input)
bool nulls_first
const int8_t * buff_ptr
Definition: TopKSort.cu:44
DEVICE auto copy(ARGS &&...args)
Definition: gpu_enabled.h:51
const int8_t * src_base
Definition: TopKSort.cu:270
KeyFetcher(K *out_base, const int8_t *src_oe_base, const size_t stride, const I *indices)
Definition: TopKSort.cu:106
Utility functions for group by buffer entries.
void collect_order_entry_column(thrust::device_ptr< K > &d_oe_col_buffer, const int8_t *d_src_buffer, const thrust::device_ptr< I > &d_idx_first, const size_t idx_count, const size_t oe_offset, const size_t oe_stride, ThrustAllocator &allocator, const int device_id)
Definition: TopKSort.cu:137
const size_t oe_stride
Definition: TopKSort.cu:117
is_taken_entry(const int8_t *buff, const size_t stride)
Definition: TopKSort.cu:39
Definition: TopKSort.cu:49
__host__ __device__ void operator()(const I index)
Definition: TopKSort.cu:265
KeyReseter(int8_t *out_base, const size_t stride, const K emp_key)
Definition: TopKSort.cu:123
void do_radix_sort(thrust::device_ptr< I > d_idx_first, const size_t idx_count, const int8_t *d_src_buffer, const PodOrderEntry &oe, const GroupByBufferLayoutInfo &layout, ThrustAllocator &allocator, const int device_id)
Definition: TopKSort.cu:180
const int8_t * oe_base
Definition: TopKSort.cu:65
const size_t key_stride
Definition: TopKSort.cu:45
const I * idx_base
Definition: TopKSort.cu:271
int8_t * rows_base
Definition: TopKSort.cu:130
const int64_t null_val
Definition: TopKSort.cu:67
__host__ __device__ bool operator()(const I index)
Definition: TopKSort.cu:41
CUstream getQueryEngineCudaStreamForDevice(int device_num)
Definition: QueryEngine.cpp:7
bool is_desc
K * key_base
Definition: TopKSort.cu:115
RowFetcher(int8_t *out_base, const int8_t *in_base, const I *indices, const size_t row_sz)
Definition: TopKSort.cu:260
size_t get_heap_size(const size_t row_size, const size_t n, const size_t thread_count)
const TargetInfo oe_target_info
#define CHECK(condition)
Definition: Logger.h:291
#define checkCudaErrors(err)
Definition: GpuInitGroups.cu:9
#define EMPTY_KEY_32
__host__ __device__ bool operator()(const I index)
Definition: TopKSort.cu:53
constexpr double n
Definition: Utm.h:38
const size_t key_stride
Definition: TopKSort.cu:131
ForwardIterator partition_by_null(ForwardIterator first, ForwardIterator last, const int64_t null_val, const bool nulls_first, const int8_t *rows_ptr, const GroupByBufferLayoutInfo &layout)
Definition: TopKSort.cu:71
void sort_indices_by_key(thrust::device_ptr< I > d_idx_first, const size_t idx_count, const thrust::device_ptr< K > &d_key_buffer, const bool desc, ThrustAllocator &allocator, const int device_id)
Definition: TopKSort.cu:157
const size_t oe_stride
Definition: TopKSort.cu:66