OmniSciDB  1dac507f6e
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
ResultSetSort.cpp
Go to the documentation of this file.
1 /*
2  * Copyright 2017 MapD Technologies, Inc.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
25 #ifdef HAVE_CUDA
26 #include "Execute.h"
27 #include "ResultSet.h"
28 #include "ResultSetSortImpl.h"
29 
30 #include "../Shared/thread_count.h"
31 
32 #include <future>
33 
34 std::unique_ptr<CudaMgr_Namespace::CudaMgr> g_cuda_mgr; // for unit tests only
35 
36 namespace {
37 
38 void set_cuda_context(Data_Namespace::DataMgr* data_mgr, const int device_id) {
39  if (data_mgr) {
40  data_mgr->getCudaMgr()->setContext(device_id);
41  return;
42  }
43  // for unit tests only
44  CHECK(g_cuda_mgr);
45  g_cuda_mgr->setContext(device_id);
46 }
47 
48 } // namespace
49 
// Builds `permutation_` for a sort on exactly one order entry by calling
// baseline_sort over the underlying group-by buffer.
//
// device_type   - chooses the worker count: getGpuCount() for GPU, otherwise
//                 cpu_threads().
// order_entries - must contain exactly one entry (checked below); its tle_no
//                 is 1-based (checked > 0, and used as `tle_no - 1`).
// top_n         - forwarded to baseline_sort and to topPermutation.
//
// NOTE(review): this text comes from a documentation listing and some listing
// lines (54, 63, 76, 146, 148) are absent; the gaps are marked where they occur.
50 void ResultSet::doBaselineSort(const ExecutorDeviceType device_type,
51  const std::list<Analyzer::OrderEntry>& order_entries,
52  const size_t top_n) {
53  CHECK_EQ(size_t(1), order_entries.size());
// NOTE(review): listing line 54 is missing from this extract.
55  const auto& oe = order_entries.front();
56  CHECK_GT(oe.tle_no, 0);
57  CHECK_LE(static_cast<size_t>(oe.tle_no), targets_.size());
// Walk the targets that precede the order-by target, accumulating their padded
// slot widths so that physical_slot_off ends up as the byte offset of the
// order-by target's slot.
58  size_t logical_slot_idx = 0;
59  size_t physical_slot_off = 0;
60  for (size_t i = 0; i < static_cast<size_t>(oe.tle_no - 1); ++i) {
61  physical_slot_off += query_mem_desc_.getPaddedSlotWidthBytes(logical_slot_idx);
62  logical_slot_idx =
// NOTE(review): listing line 63 (the right-hand side of the assignment above)
// is missing from this extract.
64  }
// Byte offset of the sort column: get_slot_off_quad() scaled by
// sizeof(int64_t), plus the offset accumulated in the loop above.
65  const auto col_off =
66  get_slot_off_quad(query_mem_desc_) * sizeof(int64_t) + physical_slot_off;
67  const size_t col_bytes = query_mem_desc_.getPaddedSlotWidthBytes(logical_slot_idx);
68  const auto row_bytes = get_row_bytes(query_mem_desc_);
69  const auto target_groupby_indices_sz = query_mem_desc_.targetGroupbyIndicesSize();
70  CHECK(target_groupby_indices_sz == 0 ||
71  static_cast<size_t>(oe.tle_no) <= target_groupby_indices_sz);
// -1 is used when the descriptor has no target group-by indices at all.
72  const ssize_t target_groupby_index{
73  target_groupby_indices_sz == 0
74  ? -1
75  : query_mem_desc_.getTargetGroupbyIndex(oe.tle_no - 1)};
// NOTE(review): listing line 76 is missing from this extract; presumably it
// opens the declaration of `layout`, which is used below -- confirm against
// the real file.
77  col_off,
78  col_bytes,
79  row_bytes,
80  targets_[oe.tle_no - 1],
81  target_groupby_index};
82  PodOrderEntry pod_oe{oe.tle_no, oe.is_desc, oe.nulls_first};
83  auto groupby_buffer = storage_->getUnderlyingBuffer();
84  auto data_mgr = getDataManager();
// Number of parallel workers: one per GPU, or one per CPU thread.
85  const auto step = static_cast<size_t>(
86  device_type == ExecutorDeviceType::GPU ? getGpuCount() : cpu_threads());
87  CHECK_GE(step, size_t(1));
88  const auto key_bytewidth = query_mem_desc_.getEffectiveKeyWidth();
89  if (step > 1) {
// Multi-worker path: launch `step` async tasks. Task `start` calls
// baseline_sort with (start, step) and writes only to
// strided_permutations[start]; that vector is captured by reference and
// outlives the tasks because every future is waited on below.
90  std::vector<std::future<void>> top_futures;
91  std::vector<std::vector<uint32_t>> strided_permutations(step);
92  for (size_t start = 0; start < step; ++start) {
93  top_futures.emplace_back(std::async(
94  std::launch::async,
95  [&strided_permutations,
96  data_mgr,
97  device_type,
98  groupby_buffer,
99  pod_oe,
100  key_bytewidth,
101  layout,
102  top_n,
103  start,
104  step] {
// On GPU the task index doubles as the device id.
105  if (device_type == ExecutorDeviceType::GPU) {
106  set_cuda_context(data_mgr, start);
107  }
// The key width selects the baseline_sort instantiation: int32_t when the
// effective key width is 4 bytes, int64_t otherwise.
108  strided_permutations[start] = (key_bytewidth == 4)
109  ? baseline_sort<int32_t>(device_type,
110  start,
111  data_mgr,
112  groupby_buffer,
113  pod_oe,
114  layout,
115  top_n,
116  start,
117  step)
118  : baseline_sort<int64_t>(device_type,
119  start,
120  data_mgr,
121  groupby_buffer,
122  pod_oe,
123  layout,
124  top_n,
125  start,
126  step);
127  }));
128  }
// Wait for every task to finish first, then call get() on each future;
// get() rethrows any exception stored by the corresponding task.
129  for (auto& top_future : top_futures) {
130  top_future.wait();
131  }
132  for (auto& top_future : top_futures) {
133  top_future.get();
134  }
// Concatenate the per-task results, then let topPermutation process the
// combined permutation with top_n and a comparator built from order_entries.
135  permutation_.reserve(strided_permutations.size() * top_n);
136  for (const auto& strided_permutation : strided_permutations) {
137  permutation_.insert(
138  permutation_.end(), strided_permutation.begin(), strided_permutation.end());
139  }
140  auto compare = createComparator(order_entries, true);
141  topPermutation(permutation_, top_n, compare);
142  return;
143  } else {
// Single-worker path: one baseline_sort call (device 0, start 0, step 1) whose
// result is assigned to permutation_ directly.
144  permutation_ =
145  (key_bytewidth == 4)
// NOTE(review): listing lines 146 and 148 (the heads of the two ternary
// branches) are missing from this extract.
147  device_type, 0, data_mgr, groupby_buffer, pod_oe, layout, top_n, 0, 1)
149  device_type, 0, data_mgr, groupby_buffer, pod_oe, layout, top_n, 0, 1);
150  }
151 }
152 
// Predicate over `order_entries` and `top_n`; every early exit visible here
// returns false.
// NOTE(review): the first line of the signature (listing line 153) is missing
// from this extract, as are listing lines 157 and 167-169, so part of the
// first condition and most of the final return expression are not visible.
154  const std::list<Analyzer::OrderEntry>& order_entries,
155  const size_t top_n) {
// Reject unless there is exactly one order entry and the descriptor does not
// use a keyless hash (plus whatever the missing line 157 adds to the condition).
156  if (order_entries.size() != 1 || query_mem_desc_.hasKeylessHash() ||
158  return false;
159  }
160  const auto& order_entry = order_entries.front();
// tle_no is 1-based: it is checked to be >= 1 and used as `tle_no - 1`.
161  CHECK_GE(order_entry.tle_no, 1);
162  CHECK_LE(static_cast<size_t>(order_entry.tle_no), targets_.size());
163  const auto& target_info = targets_[order_entry.tle_no - 1];
// Reject when the order-by target is not a number type or is a distinct target.
164  if (!target_info.sql_type.is_number() || is_distinct_target(target_info)) {
165  return false;
166  }
// Only the tail of the return expression survives; it ends with `top_n`.
170  top_n;
171 }
172 
// Returns the address of the DataMgr reached through executor_->catalog_ when
// `executor_` is set, and nullptr otherwise.
// NOTE(review): the signature (listing line 173) and listing line 175 are
// missing from this extract.
174  if (executor_) {
176  return &executor_->catalog_->getDataMgr();
177  }
178  return nullptr;
179 }
180 
181 int ResultSet::getGpuCount() const {
182  const auto data_mgr = getDataManager();
183  if (!data_mgr) {
184  return g_cuda_mgr ? g_cuda_mgr->getDeviceCount() : 0;
185  }
186  return data_mgr->gpusPresent() ? data_mgr->getCudaMgr()->getDeviceCount() : 0;
187 }
188 #endif // HAVE_CUDA
CudaMgr_Namespace::CudaMgr * getCudaMgr() const
Definition: DataMgr.h:117
#define CHECK_EQ(x, y)
Definition: Logger.h:198
ExecutorDeviceType
Data_Namespace::DataMgr & getDataMgr() const
Definition: Catalog.h:177
const Executor * executor_
Definition: ResultSet.h:814
void setContext(const int device_num) const
Definition: CudaMgr.cpp:311
QueryMemoryDescriptor query_mem_desc_
Definition: ResultSet.h:803
#define CHECK_GE(x, y)
Definition: Logger.h:203
std::unique_ptr< ResultSetStorage > storage_
Definition: ResultSet.h:804
size_t get_slot_off_quad(const QueryMemoryDescriptor &query_mem_desc)
size_t getEffectiveKeyWidth() const
#define CHECK_GT(x, y)
Definition: Logger.h:202
std::vector< uint32_t > permutation_
Definition: ResultSet.h:811
const std::vector< TargetInfo > targets_
Definition: ResultSet.h:800
size_t advance_slot(const size_t j, const TargetInfo &target_info, const bool separate_varlen_storage)
CHECK(cgen_state)
int getDeviceCount() const
Definition: CudaMgr.h:93
const Catalog_Namespace::Catalog * catalog_
Definition: Execute.h:1019
void doBaselineSort(const ExecutorDeviceType device_type, const std::list< Analyzer::OrderEntry > &order_entries, const size_t top_n)
size_t targetGroupbyIndicesSize() const
bool canUseFastBaselineSort(const std::list< Analyzer::OrderEntry > &order_entries, const size_t top_n)
bool is_distinct_target(const TargetInfo &target_info)
Definition: TargetInfo.h:116
static void topPermutation(std::vector< uint32_t > &to_sort, const size_t n, const std::function< bool(const uint32_t, const uint32_t)> compare)
Definition: ResultSet.cpp:748
const int8_t getPaddedSlotWidthBytes(const size_t slot_idx) const
int tle_no
ssize_t getTargetGroupbyIndex(const size_t target_idx) const
QueryDescriptionType getQueryDescriptionType() const
bool isSingleColumnGroupByWithPerfectHash() const
#define CHECK_LE(x, y)
Definition: Logger.h:201
size_t get_row_bytes(const QueryMemoryDescriptor &query_mem_desc)
int getGpuCount() const
Basic constructors and methods of the row set interface.
bool separate_varlen_storage_valid_
Definition: ResultSet.h:835
std::function< bool(const uint32_t, const uint32_t)> createComparator(const std::list< Analyzer::OrderEntry > &order_entries, const bool use_heap)
Definition: ResultSet.h:739
Data_Namespace::DataMgr * getDataManager() const
int cpu_threads()
Definition: thread_count.h:25
template std::vector< uint32_t > baseline_sort< int32_t >(const ExecutorDeviceType device_type, const int device_id, Data_Namespace::DataMgr *data_mgr, const int8_t *groupby_buffer, const PodOrderEntry &oe, const GroupByBufferLayoutInfo &layout, const size_t top_n, const size_t start, const size_t step)
template std::vector< uint32_t > baseline_sort< int64_t >(const ExecutorDeviceType device_type, const int device_id, Data_Namespace::DataMgr *data_mgr, const int8_t *groupby_buffer, const PodOrderEntry &oe, const GroupByBufferLayoutInfo &layout, const size_t top_n, const size_t start, const size_t step)