OmniSciDB  21ac014ffc
ResultSet.cpp
1 /*
2  * Copyright 2017 MapD Technologies, Inc.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
25 #include "ResultSet.h"
28 #include "Execute.h"
29 #include "GpuMemUtils.h"
30 #include "InPlaceSort.h"
32 #include "RuntimeFunctions.h"
33 #include "Shared/Intervals.h"
34 #include "Shared/SqlTypesLayout.h"
35 #include "Shared/checked_alloc.h"
36 #include "Shared/likely.h"
37 #include "Shared/thread_count.h"
38 #include "Shared/threadpool.h"
39 
40 #include <algorithm>
41 #include <bitset>
42 #include <future>
43 #include <numeric>
44 
45 extern bool g_use_tbb_pool;
46 
47 size_t g_parallel_top_min = 100e3;
48 size_t g_parallel_top_max = 20e6; // In effect only with g_enable_watchdog.
49 
50 void ResultSet::keepFirstN(const size_t n) {
51  CHECK_EQ(-1, cached_row_count_);
52  keep_first_ = n;
53 }
54 
55 void ResultSet::dropFirstN(const size_t n) {
56  CHECK_EQ(-1, cached_row_count_);
57  drop_first_ = n;
58 }
59 
60 ResultSet::ResultSet(const std::vector<TargetInfo>& targets,
61  const ExecutorDeviceType device_type,
62  const QueryMemoryDescriptor& query_mem_desc,
63  const std::shared_ptr<RowSetMemoryOwner> row_set_mem_owner,
64  const Catalog_Namespace::Catalog* catalog,
65  const unsigned block_size,
66  const unsigned grid_size)
67  : targets_(targets)
68  , device_type_(device_type)
69  , device_id_(-1)
70  , query_mem_desc_(query_mem_desc)
71  , crt_row_buff_idx_(0)
72  , fetched_so_far_(0)
73  , drop_first_(0)
74  , keep_first_(0)
75  , row_set_mem_owner_(row_set_mem_owner)
76  , catalog_(catalog)
77  , block_size_(block_size)
78  , grid_size_(grid_size)
79  , data_mgr_(nullptr)
80  , separate_varlen_storage_valid_(false)
81  , just_explain_(false)
82  , for_validation_only_(false)
83  , cached_row_count_(-1)
84  , geo_return_type_(GeoReturnType::WktString) {}
85 
86 ResultSet::ResultSet(const std::vector<TargetInfo>& targets,
87  const std::vector<ColumnLazyFetchInfo>& lazy_fetch_info,
88  const std::vector<std::vector<const int8_t*>>& col_buffers,
89  const std::vector<std::vector<int64_t>>& frag_offsets,
90  const std::vector<int64_t>& consistent_frag_sizes,
91  const ExecutorDeviceType device_type,
92  const int device_id,
93  const QueryMemoryDescriptor& query_mem_desc,
94  const std::shared_ptr<RowSetMemoryOwner> row_set_mem_owner,
95  const Catalog_Namespace::Catalog* catalog,
96  const unsigned block_size,
97  const unsigned grid_size)
98  : targets_(targets)
99  , device_type_(device_type)
100  , device_id_(device_id)
101  , query_mem_desc_(query_mem_desc)
102  , crt_row_buff_idx_(0)
103  , fetched_so_far_(0)
104  , drop_first_(0)
105  , keep_first_(0)
106  , row_set_mem_owner_(row_set_mem_owner)
107  , catalog_(catalog)
108  , block_size_(block_size)
109  , grid_size_(grid_size)
110  , lazy_fetch_info_(lazy_fetch_info)
111  , col_buffers_{col_buffers}
112  , frag_offsets_{frag_offsets}
113  , consistent_frag_sizes_{consistent_frag_sizes}
114  , data_mgr_(nullptr)
115  , separate_varlen_storage_valid_(false)
116  , just_explain_(false)
117  , for_validation_only_(false)
118  , cached_row_count_(-1)
119  , geo_return_type_(GeoReturnType::WktString) {}
120 
121 ResultSet::ResultSet(const std::shared_ptr<const Analyzer::Estimator> estimator,
122  const ExecutorDeviceType device_type,
123  const int device_id,
124  Data_Namespace::DataMgr* data_mgr)
125  : device_type_(device_type)
126  , device_id_(device_id)
127  , query_mem_desc_{}
128  , crt_row_buff_idx_(0)
129  , estimator_(estimator)
130  , data_mgr_(data_mgr)
131  , separate_varlen_storage_valid_(false)
132  , just_explain_(false)
133  , for_validation_only_(false)
134  , cached_row_count_(-1)
135  , geo_return_type_(GeoReturnType::WktString) {
136  if (device_type == ExecutorDeviceType::GPU) {
137  device_estimator_buffer_ = CudaAllocator::allocGpuAbstractBuffer(
138  data_mgr_, estimator_->getBufferSize(), device_id_);
139  data_mgr->getCudaMgr()->zeroDeviceMem(device_estimator_buffer_->getMemoryPtr(),
140  estimator_->getBufferSize(),
141  device_id_);
142  } else {
143  host_estimator_buffer_ =
144  static_cast<int8_t*>(checked_calloc(estimator_->getBufferSize(), 1));
145  }
146 }
147 
148 ResultSet::ResultSet(const std::string& explanation)
149  : device_type_(ExecutorDeviceType::CPU)
150  , device_id_(-1)
151  , fetched_so_far_(0)
152  , separate_varlen_storage_valid_(false)
153  , explanation_(explanation)
154  , just_explain_(true)
155  , for_validation_only_(false)
156  , cached_row_count_(-1)
157  , geo_return_type_(GeoReturnType::WktString) {}
158 
159 ResultSet::ResultSet(int64_t queue_time_ms,
160  int64_t render_time_ms,
161  const std::shared_ptr<RowSetMemoryOwner> row_set_mem_owner)
162  : device_type_(ExecutorDeviceType::CPU)
163  , device_id_(-1)
164  , fetched_so_far_(0)
165  , row_set_mem_owner_(row_set_mem_owner)
166  , timings_(QueryExecutionTimings{queue_time_ms, render_time_ms, 0, 0})
167  , separate_varlen_storage_valid_(false)
168  , just_explain_(true)
169  , for_validation_only_(false)
170  , cached_row_count_(-1)
171  , geo_return_type_(GeoReturnType::WktString) {}
172 
173 ResultSet::~ResultSet() {
174  if (storage_) {
175  if (!storage_->buff_is_provided_) {
176  CHECK(storage_->getUnderlyingBuffer());
177  free(storage_->getUnderlyingBuffer());
178  }
179  }
180  for (auto& storage : appended_storage_) {
181  if (storage && !storage->buff_is_provided_) {
182  free(storage->getUnderlyingBuffer());
183  }
184  }
185  if (host_estimator_buffer_) {
186  CHECK(device_type_ == ExecutorDeviceType::CPU || device_estimator_buffer_);
187  free(host_estimator_buffer_);
188  }
189  if (device_estimator_buffer_) {
190  CHECK(data_mgr_);
191  data_mgr_->free(device_estimator_buffer_);
192  }
193 }
194 
195 ExecutorDeviceType ResultSet::getDeviceType() const {
196  return device_type_;
197 }
198 
199 const ResultSetStorage* ResultSet::allocateStorage() const {
200  CHECK(!storage_);
201  CHECK(row_set_mem_owner_);
202  auto buff = row_set_mem_owner_->allocate(
203  query_mem_desc_.getBufferSizeBytes(device_type_), /*thread_idx=*/0);
204  storage_.reset(
205  new ResultSetStorage(targets_, query_mem_desc_, buff, /*buff_is_provided=*/true));
206  return storage_.get();
207 }
208 
209 const ResultSetStorage* ResultSet::allocateStorage(
210  int8_t* buff,
211  const std::vector<int64_t>& target_init_vals,
212  std::shared_ptr<VarlenOutputInfo> varlen_output_info) const {
213  CHECK(buff);
214  CHECK(!storage_);
215  storage_.reset(new ResultSetStorage(targets_, query_mem_desc_, buff, true));
216  // TODO: add both to the constructor
217  storage_->target_init_vals_ = target_init_vals;
218  if (varlen_output_info) {
219  storage_->varlen_output_info_ = varlen_output_info;
220  }
221  return storage_.get();
222 }
223 
224 const ResultSetStorage* ResultSet::allocateStorage(
225  const std::vector<int64_t>& target_init_vals) const {
226  CHECK(!storage_);
227  CHECK(row_set_mem_owner_);
228  auto buff = row_set_mem_owner_->allocate(
229  query_mem_desc_.getBufferSizeBytes(device_type_), /*thread_idx=*/0);
230  storage_.reset(
231  new ResultSetStorage(targets_, query_mem_desc_, buff, /*buff_is_provided=*/true));
232  storage_->target_init_vals_ = target_init_vals;
233  return storage_.get();
234 }
235 
236 size_t ResultSet::getCurrentRowBufferIndex() const {
237  if (crt_row_buff_idx_ == 0) {
238  throw std::runtime_error("current row buffer iteration index is undefined");
239  }
240  return crt_row_buff_idx_ - 1;
241 }
242 
243 // Note: that.appended_storage_ does not get appended to this.
244 void ResultSet::append(ResultSet& that) {
245  CHECK_EQ(-1, cached_row_count_);
246  if (!that.storage_) {
247  return;
248  }
249  appended_storage_.push_back(std::move(that.storage_));
250  query_mem_desc_.setEntryCount(
251  query_mem_desc_.getEntryCount() +
252  appended_storage_.back()->query_mem_desc_.getEntryCount());
253  chunks_.insert(chunks_.end(), that.chunks_.begin(), that.chunks_.end());
254  col_buffers_.insert(
255  col_buffers_.end(), that.col_buffers_.begin(), that.col_buffers_.end());
256  frag_offsets_.insert(
257  frag_offsets_.end(), that.frag_offsets_.begin(), that.frag_offsets_.end());
258  consistent_frag_sizes_.insert(consistent_frag_sizes_.end(),
259  that.consistent_frag_sizes_.begin(),
260  that.consistent_frag_sizes_.end());
261  chunk_iters_.insert(
262  chunk_iters_.end(), that.chunk_iters_.begin(), that.chunk_iters_.end());
263  if (separate_varlen_storage_valid_) {
264  CHECK(that.separate_varlen_storage_valid_);
265  serialized_varlen_buffer_.insert(serialized_varlen_buffer_.end(),
266  that.serialized_varlen_buffer_.begin(),
267  that.serialized_varlen_buffer_.end());
268  }
269  for (auto& buff : that.literal_buffers_) {
270  literal_buffers_.push_back(std::move(buff));
271  }
272 }
273 
274 const ResultSetStorage* ResultSet::getStorage() const {
275  return storage_.get();
276 }
277 
278 size_t ResultSet::colCount() const {
279  return just_explain_ ? 1 : targets_.size();
280 }
281 
282 SQLTypeInfo ResultSet::getColType(const size_t col_idx) const {
283  if (just_explain_) {
284  return SQLTypeInfo(kTEXT, false);
285  }
286  CHECK_LT(col_idx, targets_.size());
287  return targets_[col_idx].agg_kind == kAVG ? SQLTypeInfo(kDOUBLE, false)
288  : targets_[col_idx].sql_type;
289 }
290 
291 namespace {
292 
293 size_t get_truncated_row_count(size_t total_row_count, size_t limit, size_t offset) {
294  if (total_row_count < offset) {
295  return 0;
296  }
297 
298  size_t total_truncated_row_count = total_row_count - offset;
299 
300  if (limit) {
301  return std::min(total_truncated_row_count, limit);
302  }
303 
304  return total_truncated_row_count;
305 }
306 
307 } // namespace
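// --- Editor's sketch (not part of ResultSet.cpp): the LIMIT/OFFSET arithmetic of
// get_truncated_row_count() above, restated as a tiny self-contained function with a
// couple of worked examples. The name truncated_row_count_sketch is hypothetical.
#include <algorithm>
#include <cstddef>

static size_t truncated_row_count_sketch(size_t total, size_t limit, size_t offset) {
  if (total < offset) {
    return 0;  // OFFSET skips past the whole result set: nothing is left
  }
  const size_t remaining = total - offset;
  return limit ? std::min(remaining, limit) : remaining;  // limit == 0 means "no LIMIT"
}
// truncated_row_count_sketch(100, 10, 95) == 5   -- offset leaves 5 rows, limit not reached
// truncated_row_count_sketch(100, 10, 20) == 10  -- limit caps the remaining 80 rows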
308 
309 size_t ResultSet::rowCount(const bool force_parallel) const {
310  if (just_explain_) {
311  return 1;
312  }
313  if (!permutation_.empty()) {
314  if (drop_first_ > permutation_.size()) {
315  return 0;
316  }
317  const auto limited_row_count = keep_first_ + drop_first_;
318  return limited_row_count ? std::min(limited_row_count, permutation_.size())
319  : permutation_.size();
320  }
321  if (cached_row_count_ != -1) {
322  CHECK_GE(cached_row_count_, 0);
323  return cached_row_count_;
324  }
325  if (!storage_) {
326  return 0;
327  }
328  if (permutation_.empty() &&
329  query_mem_desc_.getQueryDescriptionType() == QueryDescriptionType::Projection) {
330  return binSearchRowCount();
331  }
332  if (force_parallel || entryCount() > 20000) {
333  return parallelRowCount();
334  }
335  std::lock_guard<std::mutex> lock(row_iteration_mutex_);
336  moveToBegin();
337  size_t row_count{0};
338  while (true) {
339  auto crt_row = getNextRowUnlocked(false, false);
340  if (crt_row.empty()) {
341  break;
342  }
343  ++row_count;
344  }
345  moveToBegin();
346  return row_count;
347 }
348 
349 void ResultSet::setCachedRowCount(const size_t row_count) const {
350  CHECK(cached_row_count_ == -1 || cached_row_count_ == static_cast<int64_t>(row_count));
351  cached_row_count_ = row_count;
352 }
353 
354 size_t ResultSet::binSearchRowCount() const {
355  if (!storage_) {
356  return 0;
357  }
358 
359  size_t row_count = storage_->binSearchRowCount();
360  for (auto& s : appended_storage_) {
361  row_count += s->binSearchRowCount();
362  }
363 
364  return get_truncated_row_count(row_count, getLimit(), drop_first_);
365 }
366 
367 size_t ResultSet::parallelRowCount() const {
368  auto execute_parallel_row_count = [this](auto counter_threads) -> size_t {
369  const size_t worker_count = cpu_threads();
370  for (size_t i = 0,
371  start_entry = 0,
372  stride = (entryCount() + worker_count - 1) / worker_count;
373  i < worker_count && start_entry < entryCount();
374  ++i, start_entry += stride) {
375  const auto end_entry = std::min(start_entry + stride, entryCount());
376  counter_threads.spawn(
377  [this](const size_t start, const size_t end) {
378  size_t row_count{0};
379  for (size_t i = start; i < end; ++i) {
380  if (!isRowAtEmpty(i)) {
381  ++row_count;
382  }
383  }
384  return row_count;
385  },
386  start_entry,
387  end_entry);
388  }
389  const auto row_counts = counter_threads.join();
390  const size_t row_count = std::accumulate(row_counts.begin(), row_counts.end(), 0);
391  return row_count;
392  };
393  // will fall back to futures threadpool if TBB is not enabled
394  const auto row_count =
395  g_use_tbb_pool
396  ? execute_parallel_row_count(threadpool::ThreadPool<size_t>())
397  : execute_parallel_row_count(threadpool::FuturesThreadPool<size_t>());
398 
399  return get_truncated_row_count(row_count, getLimit(), drop_first_);
400 }
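// --- Editor's sketch (not part of ResultSet.cpp): the chunking pattern used by
// parallelRowCount() above, restated with std::async only, so the ceil-division stride
// and the serial reduction are easy to follow. All names are hypothetical and a
// vector<bool> of "is this entry empty?" flags stands in for the result-set storage.
#include <algorithm>
#include <future>
#include <vector>

static size_t parallel_count_sketch(const std::vector<bool>& entry_is_empty,
                                    const size_t worker_count) {
  const size_t entry_count = entry_is_empty.size();
  const size_t stride =
      (entry_count + worker_count - 1) / std::max<size_t>(worker_count, 1);  // ceil-div
  std::vector<std::future<size_t>> workers;
  for (size_t start = 0; start < entry_count; start += stride) {
    const size_t end = std::min(start + stride, entry_count);
    workers.push_back(std::async(std::launch::async, [&entry_is_empty, start, end] {
      size_t count{0};
      for (size_t i = start; i < end; ++i) {
        count += entry_is_empty[i] ? 0 : 1;  // count non-empty entries in this range
      }
      return count;
    }));
  }
  size_t total{0};
  for (auto& worker : workers) {
    total += worker.get();  // serial reduction of the per-range counts
  }
  return total;
}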
401 
402 bool ResultSet::definitelyHasNoRows() const {
403  return !storage_ && !estimator_ && !just_explain_;
404 }
405 
406 const QueryMemoryDescriptor& ResultSet::getQueryMemDesc() const {
407  CHECK(storage_);
408  return storage_->query_mem_desc_;
409 }
410 
411 const std::vector<TargetInfo>& ResultSet::getTargetInfos() const {
412  return targets_;
413 }
414 
415 const std::vector<int64_t>& ResultSet::getTargetInitVals() const {
416  CHECK(storage_);
417  return storage_->target_init_vals_;
418 }
419 
420 int8_t* ResultSet::getDeviceEstimatorBuffer() const {
421  CHECK(device_type_ == ExecutorDeviceType::GPU);
422  CHECK(device_estimator_buffer_);
423  return device_estimator_buffer_->getMemoryPtr();
424 }
425 
426 int8_t* ResultSet::getHostEstimatorBuffer() const {
427  return host_estimator_buffer_;
428 }
429 
430 void ResultSet::syncEstimatorBuffer() const {
431  CHECK(device_type_ == ExecutorDeviceType::GPU);
432  CHECK(!host_estimator_buffer_);
433  CHECK_EQ(size_t(0), estimator_->getBufferSize() % sizeof(int64_t));
434  host_estimator_buffer_ =
435  static_cast<int8_t*>(checked_calloc(estimator_->getBufferSize(), 1));
436  CHECK(device_estimator_buffer_);
437  auto device_buffer_ptr = device_estimator_buffer_->getMemoryPtr();
438  copy_from_gpu(data_mgr_,
439  host_estimator_buffer_,
440  reinterpret_cast<CUdeviceptr>(device_buffer_ptr),
441  estimator_->getBufferSize(),
442  device_id_);
443 }
444 
445 void ResultSet::setQueueTime(const int64_t queue_time) {
446  timings_.executor_queue_time = queue_time;
447 }
448 
449 void ResultSet::setKernelQueueTime(const int64_t kernel_queue_time) {
450  timings_.kernel_queue_time = kernel_queue_time;
451 }
452 
453 void ResultSet::addCompilationQueueTime(const int64_t compilation_queue_time) {
454  timings_.compilation_queue_time += compilation_queue_time;
455 }
456 
457 int64_t ResultSet::getQueueTime() const {
458  return timings_.executor_queue_time + timings_.kernel_queue_time +
459  timings_.compilation_queue_time;
460 }
461 
462 int64_t ResultSet::getRenderTime() const {
463  return timings_.render_time;
464 }
465 
466 void ResultSet::moveToBegin() const {
467  crt_row_buff_idx_ = 0;
468  fetched_so_far_ = 0;
469 }
470 
471 bool ResultSet::isTruncated() const {
472  return keep_first_ + drop_first_;
473 }
474 
475 bool ResultSet::isExplain() const {
476  return just_explain_;
477 }
478 
479 void ResultSet::setValidationOnlyRes() {
480  for_validation_only_ = true;
481 }
482 
483 bool ResultSet::isValidationOnlyRes() const {
484  return for_validation_only_;
485 }
486 
487 int ResultSet::getDeviceId() const {
488  return device_id_;
489 }
490 
491 QueryMemoryDescriptor ResultSet::fixupQueryMemoryDescriptor(
492  const QueryMemoryDescriptor& query_mem_desc) {
493  auto query_mem_desc_copy = query_mem_desc;
494  query_mem_desc_copy.resetGroupColWidths(
495  std::vector<int8_t>(query_mem_desc_copy.getGroupbyColCount(), 8));
496  if (query_mem_desc.didOutputColumnar()) {
497  return query_mem_desc_copy;
498  }
499  query_mem_desc_copy.alignPaddedSlots();
500  return query_mem_desc_copy;
501 }
502 
503 void ResultSet::sort(const std::list<Analyzer::OrderEntry>& order_entries,
504  size_t top_n,
505  const Executor* executor) {
506  auto timer = DEBUG_TIMER(__func__);
507 
508  if (!storage_) {
509  return;
510  }
511  CHECK_EQ(-1, cached_row_count_);
512  CHECK(!targets_.empty());
513 #ifdef HAVE_CUDA
514  if (canUseFastBaselineSort(order_entries, top_n)) {
515  baselineSort(order_entries, top_n, executor);
516  return;
517  }
518 #endif // HAVE_CUDA
519  if (query_mem_desc_.sortOnGpu()) {
520  try {
521  radixSortOnGpu(order_entries);
522  } catch (const OutOfMemory&) {
523  LOG(WARNING) << "Out of GPU memory during sort, finish on CPU";
524  radixSortOnCpu(order_entries);
525  } catch (const std::bad_alloc&) {
526  LOG(WARNING) << "Out of GPU memory during sort, finish on CPU";
527  radixSortOnCpu(order_entries);
528  }
529  return;
530  }
531  // This check isn't strictly required, but allows the index buffer to be 32-bit.
532  if (query_mem_desc_.getEntryCount() > std::numeric_limits<uint32_t>::max()) {
533  throw RowSortException("Sorting more than 4B elements not supported");
534  }
535 
536  CHECK(permutation_.empty());
537 
538  if (top_n && g_parallel_top_min < entryCount()) {
539  if (g_enable_watchdog && g_parallel_top_max < entryCount()) {
540  throw WatchdogException("Sorting the result would be too slow");
541  }
542  parallelTop(order_entries, top_n, executor);
543  } else {
544  if (g_enable_watchdog && Executor::baseline_threshold < entryCount()) {
545  throw WatchdogException("Sorting the result would be too slow");
546  }
547  permutation_.resize(query_mem_desc_.getEntryCount());
548  // PermutationView is used to share a common API with parallelTop().
549  PermutationView pv(permutation_.data(), 0, permutation_.size());
550  pv = initPermutationBuffer(pv, 0, permutation_.size());
551  if (top_n == 0) {
552  top_n = pv.size(); // top_n == 0 implies a full sort
553  }
554  pv = topPermutation(pv, top_n, createComparator(order_entries, pv, executor, false));
555  if (pv.size() < permutation_.size()) {
556  permutation_.resize(pv.size());
557  permutation_.shrink_to_fit();
558  }
559  }
560 }
561 
562 #ifdef HAVE_CUDA
563 void ResultSet::baselineSort(const std::list<Analyzer::OrderEntry>& order_entries,
564  const size_t top_n,
565  const Executor* executor) {
566  auto timer = DEBUG_TIMER(__func__);
567  // If we only have one GPU, it's usually faster to do a multi-threaded radix sort on CPU
568  if (getGpuCount() > 1) {
569  try {
570  doBaselineSort(ExecutorDeviceType::GPU, order_entries, top_n, executor);
571  } catch (...) {
572  doBaselineSort(ExecutorDeviceType::CPU, order_entries, top_n, executor);
573  }
574  } else {
575  doBaselineSort(ExecutorDeviceType::CPU, order_entries, top_n, executor);
576  }
577 }
578 #endif // HAVE_CUDA
579 
580 // Append non-empty indexes i in [begin,end) from findStorage(i) to permutation.
581 PermutationView ResultSet::initPermutationBuffer(PermutationView permutation,
582  PermutationIdx const begin,
583  PermutationIdx const end) const {
584  auto timer = DEBUG_TIMER(__func__);
585  for (PermutationIdx i = begin; i < end; ++i) {
586  const auto storage_lookup_result = findStorage(i);
587  const auto lhs_storage = storage_lookup_result.storage_ptr;
588  const auto off = storage_lookup_result.fixedup_entry_idx;
589  CHECK(lhs_storage);
590  if (!lhs_storage->isEmptyEntry(off)) {
591  permutation.push_back(i);
592  }
593  }
594  return permutation;
595 }
596 
597 const Permutation& ResultSet::getPermutationBuffer() const {
598  return permutation_;
599 }
600 
601 void ResultSet::parallelTop(const std::list<Analyzer::OrderEntry>& order_entries,
602  const size_t top_n,
603  const Executor* executor) {
604  auto timer = DEBUG_TIMER(__func__);
605  const size_t nthreads = cpu_threads();
606 
607  // Split permutation_ into nthreads subranges and top-sort in-place.
608  permutation_.resize(query_mem_desc_.getEntryCount());
609  std::vector<PermutationView> permutation_views(nthreads);
610  const auto top_sort_interval = [&, top_n, executor](const auto interval) {
611  PermutationView pv(permutation_.data() + interval.begin, 0, interval.size());
612  pv = initPermutationBuffer(pv, interval.begin, interval.end);
613  const auto compare = createComparator(order_entries, pv, executor, true);
614  permutation_views[interval.index] = topPermutation(pv, top_n, compare);
615  };
616  threadpool::FuturesThreadPool<void> top_sort_threads;
617  for (auto interval : makeIntervals<PermutationIdx>(0, permutation_.size(), nthreads)) {
618  top_sort_threads.spawn(top_sort_interval, interval);
619  }
620  top_sort_threads.join();
621 
622  // In case you are considering implementing a parallel reduction, note that the
623  // ResultSetComparator constructor is O(N) in order to materialize some of the aggregate
624  // columns as necessary to perform a comparison. This cost is why reduction is chosen to
625  // be serial instead; only one more Comparator is needed below.
626 
627  // Left-copy disjoint top-sorted subranges into one contiguous range.
628  // ++++....+++.....+++++... -> ++++++++++++............
629  auto end = permutation_.begin() + permutation_views.front().size();
630  for (size_t i = 1; i < nthreads; ++i) {
631  std::copy(permutation_views[i].begin(), permutation_views[i].end(), end);
632  end += permutation_views[i].size();
633  }
634 
635  // Top sort final range.
636  PermutationView pv(permutation_.data(), end - permutation_.begin());
637  const auto compare = createComparator(order_entries, pv, executor, false);
638  pv = topPermutation(pv, top_n, compare);
639  permutation_.resize(pv.size());
640  permutation_.shrink_to_fit();
641 }
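// --- Editor's sketch (not part of ResultSet.cpp): the two-phase strategy of
// parallelTop() above -- take the top n of each subrange, then take the top n of the
// surviving candidates -- shown on a plain std::vector<int> with std::partial_sort.
// A serial loop stands in for the thread pool; names are hypothetical and top_n is
// assumed to be non-zero.
#include <algorithm>
#include <vector>

static std::vector<int> parallel_top_sketch(std::vector<int> values,
                                            const size_t top_n,
                                            const size_t nthreads) {
  const size_t stride = (values.size() + nthreads - 1) / std::max<size_t>(nthreads, 1);
  std::vector<int> candidates;
  for (size_t start = 0; start < values.size(); start += stride) {
    const size_t end = std::min(start + stride, values.size());
    const size_t keep = std::min(top_n, end - start);
    // Top-sort this subrange in place (parallelTop() does this on a worker thread).
    std::partial_sort(
        values.begin() + start, values.begin() + start + keep, values.begin() + end);
    candidates.insert(
        candidates.end(), values.begin() + start, values.begin() + start + keep);
  }
  // Final top-n over the candidates only; every global top-n element must also be a
  // top-n element of its own subrange, so nothing is lost by discarding the rest.
  const size_t keep = std::min(top_n, candidates.size());
  std::partial_sort(candidates.begin(), candidates.begin() + keep, candidates.end());
  candidates.resize(keep);
  return candidates;
}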
642 
643 std::pair<size_t, size_t> ResultSet::getStorageIndex(const size_t entry_idx) const {
644  size_t fixedup_entry_idx = entry_idx;
645  auto entry_count = storage_->query_mem_desc_.getEntryCount();
646  const bool is_rowwise_layout = !storage_->query_mem_desc_.didOutputColumnar();
647  if (fixedup_entry_idx < entry_count) {
648  return {0, fixedup_entry_idx};
649  }
650  fixedup_entry_idx -= entry_count;
651  for (size_t i = 0; i < appended_storage_.size(); ++i) {
652  const auto& desc = appended_storage_[i]->query_mem_desc_;
653  CHECK_NE(is_rowwise_layout, desc.didOutputColumnar());
654  entry_count = desc.getEntryCount();
655  if (fixedup_entry_idx < entry_count) {
656  return {i + 1, fixedup_entry_idx};
657  }
658  fixedup_entry_idx -= entry_count;
659  }
660  UNREACHABLE() << "entry_idx = " << entry_idx << ", query_mem_desc_.getEntryCount() = "
661  << query_mem_desc_.getEntryCount();
662  return {};
663 }
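// --- Editor's sketch (not part of ResultSet.cpp): the index fix-up performed by
// getStorageIndex() above, restated over a plain list of per-storage entry counts
// (storage_ first, then each appended storage). Returns {storage_index,
// local_entry_index}; the name is hypothetical.
#include <cstddef>
#include <utility>
#include <vector>

static std::pair<size_t, size_t> storage_index_sketch(
    const std::vector<size_t>& entry_counts,
    size_t entry_idx) {
  for (size_t storage_idx = 0; storage_idx < entry_counts.size(); ++storage_idx) {
    if (entry_idx < entry_counts[storage_idx]) {
      return {storage_idx, entry_idx};  // found the owning storage
    }
    entry_idx -= entry_counts[storage_idx];  // skip past this storage's entries
  }
  return {entry_counts.size(), 0};  // out of range (the real code hits UNREACHABLE())
}
// storage_index_sketch({100, 50, 50}, 130) == {1, 30}: global entry 130 lives in the
// first appended storage, at local offset 30.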
664 
667 
668 ResultSet::StorageLookupResult ResultSet::findStorage(const size_t entry_idx) const {
669  auto [stg_idx, fixedup_entry_idx] = getStorageIndex(entry_idx);
670  return {stg_idx ? appended_storage_[stg_idx - 1].get() : storage_.get(),
671  fixedup_entry_idx,
672  stg_idx};
673 }
674 
675 template <typename BUFFER_ITERATOR_TYPE>
676 void ResultSet::ResultSetComparator<
677  BUFFER_ITERATOR_TYPE>::materializeCountDistinctColumns() {
678  for (const auto& order_entry : order_entries_) {
679  if (is_distinct_target(result_set_->targets_[order_entry.tle_no - 1])) {
680  count_distinct_materialized_buffers_.emplace_back(
681  materializeCountDistinctColumn(order_entry));
682  }
683  }
684 }
685 
686 template <typename BUFFER_ITERATOR_TYPE>
687 ResultSet::ApproxQuantileBuffers ResultSet::ResultSetComparator<
688  BUFFER_ITERATOR_TYPE>::materializeApproxQuantileColumns() const {
689  ResultSet::ApproxQuantileBuffers approx_quantile_materialized_buffers;
690  for (const auto& order_entry : order_entries_) {
691  if (result_set_->targets_[order_entry.tle_no - 1].agg_kind == kAPPROX_QUANTILE) {
692  approx_quantile_materialized_buffers.emplace_back(
693  materializeApproxQuantileColumn(order_entry));
694  }
695  }
696  return approx_quantile_materialized_buffers;
697 }
698 
699 template <typename BUFFER_ITERATOR_TYPE>
700 std::vector<int64_t>
701 ResultSet::ResultSetComparator<BUFFER_ITERATOR_TYPE>::materializeCountDistinctColumn(
702  const Analyzer::OrderEntry& order_entry) const {
703  const size_t num_storage_entries = result_set_->query_mem_desc_.getEntryCount();
704  std::vector<int64_t> count_distinct_materialized_buffer(num_storage_entries);
705  const CountDistinctDescriptor count_distinct_descriptor =
706  result_set_->query_mem_desc_.getCountDistinctDescriptor(order_entry.tle_no - 1);
707  const size_t num_non_empty_entries = permutation_.size();
708  const auto work = [&](const size_t start, const size_t end) {
709  for (size_t i = start; i < end; ++i) {
710  const PermutationIdx permuted_idx = permutation_[i];
711  const auto storage_lookup_result = result_set_->findStorage(permuted_idx);
712  const auto storage = storage_lookup_result.storage_ptr;
713  const auto off = storage_lookup_result.fixedup_entry_idx;
714  const auto value = buffer_itr_.getColumnInternal(
715  storage->buff_, off, order_entry.tle_no - 1, storage_lookup_result);
716  count_distinct_materialized_buffer[permuted_idx] =
717  count_distinct_set_size(value.i1, count_distinct_descriptor);
718  }
719  };
720  // TODO(tlm): Allow use of tbb after we determine how to easily encapsulate the choice
721  // between thread pool types
722  if (single_threaded_) {
723  work(0, num_non_empty_entries);
724  } else {
725  threadpool::FuturesThreadPool<void> thread_pool;
726  for (auto interval : makeIntervals<size_t>(0, num_non_empty_entries, cpu_threads())) {
727  thread_pool.spawn(work, interval.begin, interval.end);
728  }
729  thread_pool.join();
730  }
731  return count_distinct_materialized_buffer;
732 }
733 
734 double ResultSet::calculateQuantile(quantile::TDigest* const t_digest) {
735  static_assert(sizeof(int64_t) == sizeof(quantile::TDigest*));
736  CHECK(t_digest);
737  t_digest->mergeBuffer();
738  double const quantile = t_digest->quantile();
739  return boost::math::isnan(quantile) ? NULL_DOUBLE : quantile;
740 }
741 
742 template <typename BUFFER_ITERATOR_TYPE>
743 ResultSet::ApproxQuantileBuffers::value_type
744 ResultSet::ResultSetComparator<BUFFER_ITERATOR_TYPE>::materializeApproxQuantileColumn(
745  const Analyzer::OrderEntry& order_entry) const {
746  ResultSet::ApproxQuantileBuffers::value_type materialized_buffer(
747  result_set_->query_mem_desc_.getEntryCount());
748  const size_t size = permutation_.size();
749  const auto work = [&](const size_t start, const size_t end) {
750  for (size_t i = start; i < end; ++i) {
751  const PermutationIdx permuted_idx = permutation_[i];
752  const auto storage_lookup_result = result_set_->findStorage(permuted_idx);
753  const auto storage = storage_lookup_result.storage_ptr;
754  const auto off = storage_lookup_result.fixedup_entry_idx;
755  const auto value = buffer_itr_.getColumnInternal(
756  storage->buff_, off, order_entry.tle_no - 1, storage_lookup_result);
757  materialized_buffer[permuted_idx] =
758  value.i1 ? calculateQuantile(reinterpret_cast<quantile::TDigest*>(value.i1))
759  : NULL_DOUBLE;
760  }
761  };
762  if (single_threaded_) {
763  work(0, size);
764  } else {
765  threadpool::FuturesThreadPool<void> thread_pool;
766  for (auto interval : makeIntervals<size_t>(0, size, cpu_threads())) {
767  thread_pool.spawn(work, interval.begin, interval.end);
768  }
769  thread_pool.join();
770  }
771  return materialized_buffer;
772 }
773 
774 template <typename BUFFER_ITERATOR_TYPE>
775 bool ResultSet::ResultSetComparator<BUFFER_ITERATOR_TYPE>::operator()(
776  const PermutationIdx lhs,
777  const PermutationIdx rhs) const {
778  // NB: The compare function must define a strict weak ordering, otherwise
779  // std::sort will trigger a segmentation fault (or corrupt memory).
780  const auto lhs_storage_lookup_result = result_set_->findStorage(lhs);
781  const auto rhs_storage_lookup_result = result_set_->findStorage(rhs);
782  const auto lhs_storage = lhs_storage_lookup_result.storage_ptr;
783  const auto rhs_storage = rhs_storage_lookup_result.storage_ptr;
784  const auto fixedup_lhs = lhs_storage_lookup_result.fixedup_entry_idx;
785  const auto fixedup_rhs = rhs_storage_lookup_result.fixedup_entry_idx;
786  size_t materialized_count_distinct_buffer_idx{0};
787  size_t materialized_approx_quantile_buffer_idx{0};
788 
789  for (const auto& order_entry : order_entries_) {
790  CHECK_GE(order_entry.tle_no, 1);
791  const auto& agg_info = result_set_->targets_[order_entry.tle_no - 1];
792  const auto entry_ti = get_compact_type(agg_info);
793  bool float_argument_input = takes_float_argument(agg_info);
794  // Need to determine if the float value has been stored as a float
795  // or compacted to a different (often larger, 8-byte) slot width;
796  // in the distributed case the floats are actually 4 bytes.
797  // TODO: takes_float_argument() above is widely used; check whether this
798  // problem exists elsewhere.
799  if (entry_ti.get_type() == kFLOAT) {
800  const auto is_col_lazy =
801  !result_set_->lazy_fetch_info_.empty() &&
802  result_set_->lazy_fetch_info_[order_entry.tle_no - 1].is_lazily_fetched;
803  if (result_set_->query_mem_desc_.getPaddedSlotWidthBytes(order_entry.tle_no - 1) ==
804  sizeof(float)) {
805  float_argument_input =
806  result_set_->query_mem_desc_.didOutputColumnar() ? !is_col_lazy : true;
807  }
808  }
809 
810  if (UNLIKELY(is_distinct_target(agg_info))) {
811  CHECK_LT(materialized_count_distinct_buffer_idx,
812  count_distinct_materialized_buffers_.size());
813 
814  const auto& count_distinct_materialized_buffer =
815  count_distinct_materialized_buffers_[materialized_count_distinct_buffer_idx];
816  const auto lhs_sz = count_distinct_materialized_buffer[lhs];
817  const auto rhs_sz = count_distinct_materialized_buffer[rhs];
818  ++materialized_count_distinct_buffer_idx;
819  if (lhs_sz == rhs_sz) {
820  continue;
821  }
822  return (lhs_sz < rhs_sz) != order_entry.is_desc;
823  } else if (UNLIKELY(agg_info.agg_kind == kAPPROX_QUANTILE)) {
824  CHECK_LT(materialized_approx_quantile_buffer_idx,
825  approx_quantile_materialized_buffers_.size());
826  const auto& approx_quantile_materialized_buffer =
827  approx_quantile_materialized_buffers_[materialized_approx_quantile_buffer_idx];
828  const auto lhs_value = approx_quantile_materialized_buffer[lhs];
829  const auto rhs_value = approx_quantile_materialized_buffer[rhs];
830  ++materialized_approx_quantile_buffer_idx;
831  if (lhs_value == rhs_value) {
832  continue;
833  } else if (!entry_ti.get_notnull()) {
834  if (lhs_value == NULL_DOUBLE) {
835  return order_entry.nulls_first;
836  } else if (rhs_value == NULL_DOUBLE) {
837  return !order_entry.nulls_first;
838  }
839  }
840  return (lhs_value < rhs_value) != order_entry.is_desc;
841  }
842 
843  const auto lhs_v = buffer_itr_.getColumnInternal(lhs_storage->buff_,
844  fixedup_lhs,
845  order_entry.tle_no - 1,
846  lhs_storage_lookup_result);
847  const auto rhs_v = buffer_itr_.getColumnInternal(rhs_storage->buff_,
848  fixedup_rhs,
849  order_entry.tle_no - 1,
850  rhs_storage_lookup_result);
851 
852  if (UNLIKELY(isNull(entry_ti, lhs_v, float_argument_input) &&
853  isNull(entry_ti, rhs_v, float_argument_input))) {
854  continue;
855  }
856  if (UNLIKELY(isNull(entry_ti, lhs_v, float_argument_input) &&
857  !isNull(entry_ti, rhs_v, float_argument_input))) {
858  return order_entry.nulls_first;
859  }
860  if (UNLIKELY(isNull(entry_ti, rhs_v, float_argument_input) &&
861  !isNull(entry_ti, lhs_v, float_argument_input))) {
862  return !order_entry.nulls_first;
863  }
864 
865  if (LIKELY(lhs_v.isInt())) {
866  CHECK(rhs_v.isInt());
867  if (UNLIKELY(entry_ti.is_string() &&
868  entry_ti.get_compression() == kENCODING_DICT)) {
869  CHECK_EQ(4, entry_ti.get_logical_size());
870  CHECK(executor_);
871  const auto string_dict_proxy = executor_->getStringDictionaryProxy(
872  entry_ti.get_comp_param(), result_set_->row_set_mem_owner_, false);
873  auto lhs_str = string_dict_proxy->getString(lhs_v.i1);
874  auto rhs_str = string_dict_proxy->getString(rhs_v.i1);
875  if (lhs_str == rhs_str) {
876  continue;
877  }
878  return (lhs_str < rhs_str) != order_entry.is_desc;
879  }
880 
881  if (lhs_v.i1 == rhs_v.i1) {
882  continue;
883  }
884  if (entry_ti.is_fp()) {
885  if (float_argument_input) {
886  const auto lhs_dval = *reinterpret_cast<const float*>(may_alias_ptr(&lhs_v.i1));
887  const auto rhs_dval = *reinterpret_cast<const float*>(may_alias_ptr(&rhs_v.i1));
888  return (lhs_dval < rhs_dval) != order_entry.is_desc;
889  } else {
890  const auto lhs_dval =
891  *reinterpret_cast<const double*>(may_alias_ptr(&lhs_v.i1));
892  const auto rhs_dval =
893  *reinterpret_cast<const double*>(may_alias_ptr(&rhs_v.i1));
894  return (lhs_dval < rhs_dval) != order_entry.is_desc;
895  }
896  }
897  return (lhs_v.i1 < rhs_v.i1) != order_entry.is_desc;
898  } else {
899  if (lhs_v.isPair()) {
900  CHECK(rhs_v.isPair());
901  const auto lhs =
902  pair_to_double({lhs_v.i1, lhs_v.i2}, entry_ti, float_argument_input);
903  const auto rhs =
904  pair_to_double({rhs_v.i1, rhs_v.i2}, entry_ti, float_argument_input);
905  if (lhs == rhs) {
906  continue;
907  }
908  return (lhs < rhs) != order_entry.is_desc;
909  } else {
910  CHECK(lhs_v.isStr() && rhs_v.isStr());
911  const auto lhs = lhs_v.strVal();
912  const auto rhs = rhs_v.strVal();
913  if (lhs == rhs) {
914  continue;
915  }
916  return (lhs < rhs) != order_entry.is_desc;
917  }
918  }
919  }
920  return false;
921 }
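// --- Editor's sketch (not part of ResultSet.cpp): the "(lhs < rhs) != is_desc" idiom
// used throughout operator() above. Negating a strict "<" with "!=" flips the sort
// direction while keeping the comparator a strict weak ordering, which std::sort and
// std::partial_sort require (see the NB comment at the top of operator()). The
// function name is hypothetical.
#include <cstdint>

static bool ordered_before_sketch(const int64_t lhs, const int64_t rhs, const bool is_desc) {
  if (lhs == rhs) {
    return false;  // equal keys must compare "not before" in both directions
  }
  return (lhs < rhs) != is_desc;  // ascending when !is_desc, descending otherwise
}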
922 
923 // Partially sort permutation into the top n elements (least by compare).
924 // If permutation.size() <= n then sort the entire permutation by compare.
925 // Return a PermutationView with new size() = min(n, permutation.size()).
926 PermutationView ResultSet::topPermutation(PermutationView permutation,
927  const size_t n,
928  const Comparator& compare) {
929  auto timer = DEBUG_TIMER(__func__);
930  if (n < permutation.size()) {
931  std::partial_sort(
932  permutation.begin(), permutation.begin() + n, permutation.end(), compare);
933  permutation.resize(n);
934  } else {
935  std::sort(permutation.begin(), permutation.end(), compare);
936  }
937  return permutation;
938 }
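// --- Editor's sketch (not part of ResultSet.cpp): topPermutation() sorts row *indices*
// rather than rows, so all knowledge of the storage layout stays inside the comparator.
// A minimal index-sorting version over a plain column of doubles; names are hypothetical.
#include <algorithm>
#include <cstdint>
#include <numeric>
#include <vector>

static std::vector<uint32_t> top_permutation_sketch(const std::vector<double>& column,
                                                    size_t n) {
  std::vector<uint32_t> perm(column.size());
  std::iota(perm.begin(), perm.end(), uint32_t(0));  // identity permutation
  n = std::min(n, perm.size());
  std::partial_sort(perm.begin(),
                    perm.begin() + n,
                    perm.end(),
                    [&column](const uint32_t lhs, const uint32_t rhs) {
                      return column[lhs] < column[rhs];  // compare rows through the index
                    });
  perm.resize(n);  // keep only the top-n indices, mirroring permutation.resize(n) above
  return perm;
}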
939 
940 void ResultSet::radixSortOnGpu(
941  const std::list<Analyzer::OrderEntry>& order_entries) const {
942  auto timer = DEBUG_TIMER(__func__);
943  auto data_mgr = &catalog_->getDataMgr();
944  const int device_id{0};
945  CudaAllocator cuda_allocator(data_mgr, device_id);
946  CHECK_GT(block_size_, 0);
947  CHECK_GT(grid_size_, 0);
948  std::vector<int64_t*> group_by_buffers(block_size_);
949  group_by_buffers[0] = reinterpret_cast<int64_t*>(storage_->getUnderlyingBuffer());
950  auto dev_group_by_buffers =
951  create_dev_group_by_buffers(&cuda_allocator,
952  group_by_buffers,
953  query_mem_desc_,
954  block_size_,
955  grid_size_,
956  device_id,
957  ExecutorDispatchMode::KernelPerFragment,
958  /*num_input_rows=*/-1,
959  /*prepend_index_buffer=*/true,
960  /*always_init_group_by_on_host=*/true,
961  /*use_bump_allocator=*/false,
962  /*has_varlen_output=*/false,
963  /*insitu_allocator*=*/nullptr);
964  inplace_sort_gpu(
965  order_entries, query_mem_desc_, dev_group_by_buffers, data_mgr, device_id);
966  copy_group_by_buffers_from_gpu(
967  data_mgr,
968  group_by_buffers,
969  query_mem_desc_.getBufferSizeBytes(ExecutorDeviceType::GPU),
970  dev_group_by_buffers.data,
971  query_mem_desc_,
972  block_size_,
973  grid_size_,
974  device_id,
975  /*use_bump_allocator=*/false,
976  /*has_varlen_output=*/false);
977 }
978 
979 void ResultSet::radixSortOnCpu(
980  const std::list<Analyzer::OrderEntry>& order_entries) const {
981  auto timer = DEBUG_TIMER(__func__);
982  CHECK(!query_mem_desc_.hasKeylessHash());
983  std::vector<int64_t> tmp_buff(query_mem_desc_.getEntryCount());
984  std::vector<int32_t> idx_buff(query_mem_desc_.getEntryCount());
985  CHECK_EQ(size_t(1), order_entries.size());
986  auto buffer_ptr = storage_->getUnderlyingBuffer();
987  for (const auto& order_entry : order_entries) {
988  const auto target_idx = order_entry.tle_no - 1;
989  const auto sortkey_val_buff = reinterpret_cast<int64_t*>(
990  buffer_ptr + query_mem_desc_.getColOffInBytes(target_idx));
991  const auto chosen_bytes = query_mem_desc_.getPaddedSlotWidthBytes(target_idx);
992  sort_groups_cpu(sortkey_val_buff,
993  &idx_buff[0],
994  query_mem_desc_.getEntryCount(),
995  order_entry.is_desc,
996  chosen_bytes);
997  apply_permutation_cpu(reinterpret_cast<int64_t*>(buffer_ptr),
998  &idx_buff[0],
999  query_mem_desc_.getEntryCount(),
1000  &tmp_buff[0],
1001  sizeof(int64_t));
1002  for (size_t target_idx = 0; target_idx < query_mem_desc_.getSlotCount();
1003  ++target_idx) {
1004  if (static_cast<int>(target_idx) == order_entry.tle_no - 1) {
1005  continue;
1006  }
1007  const auto chosen_bytes = query_mem_desc_.getPaddedSlotWidthBytes(target_idx);
1008  const auto satellite_val_buff = reinterpret_cast<int64_t*>(
1009  buffer_ptr + query_mem_desc_.getColOffInBytes(target_idx));
1010  apply_permutation_cpu(satellite_val_buff,
1011  &idx_buff[0],
1012  query_mem_desc_.getEntryCount(),
1013  &tmp_buff[0],
1014  chosen_bytes);
1015  }
1016  }
1017 }
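// --- Editor's sketch (not part of ResultSet.cpp): the two-step pattern used by
// radixSortOnCpu() above -- sort the key column while recording the index permutation,
// then apply that same permutation to every satellite column -- restated with std::
// algorithms in place of sort_groups_cpu()/apply_permutation_cpu(). Names are
// hypothetical and all columns are assumed to be plain int64_t.
#include <algorithm>
#include <cstdint>
#include <numeric>
#include <vector>

static void sort_columns_by_key_sketch(std::vector<int64_t>& key_col,
                                       std::vector<std::vector<int64_t>>& satellite_cols,
                                       const bool desc) {
  std::vector<int32_t> idx(key_col.size());
  std::iota(idx.begin(), idx.end(), 0);
  std::stable_sort(idx.begin(), idx.end(), [&key_col, desc](int32_t lhs, int32_t rhs) {
    return desc ? key_col[rhs] < key_col[lhs] : key_col[lhs] < key_col[rhs];
  });
  const auto apply_permutation = [&idx](std::vector<int64_t>& col) {
    std::vector<int64_t> tmp(col.size());  // analogous to tmp_buff above
    for (size_t i = 0; i < idx.size(); ++i) {
      tmp[i] = col[idx[i]];
    }
    col.swap(tmp);
  };
  apply_permutation(key_col);
  for (auto& col : satellite_cols) {
    apply_permutation(col);
  }
}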
1018 
1019 size_t ResultSet::getLimit() const {
1020  return keep_first_;
1021 }
1022 
1023 std::shared_ptr<const std::vector<std::string>> ResultSet::getStringDictionaryPayloadCopy(
1024  const int dict_id) const {
1025  const auto sdp = row_set_mem_owner_->getOrAddStringDictProxy(
1026  dict_id, /*with_generation=*/false, catalog_);
1027  CHECK(sdp);
1028  return sdp->getDictionary()->copyStrings();
1029 }
1030 
1038 bool ResultSet::isDirectColumnarConversionPossible() const {
1039  if (!g_enable_direct_columnarization) {
1040  return false;
1041  } else if (query_mem_desc_.didOutputColumnar()) {
1042  return permutation_.empty() && (query_mem_desc_.getQueryDescriptionType() ==
1043  QueryDescriptionType::Projection ||
1044  (query_mem_desc_.getQueryDescriptionType() ==
1045  QueryDescriptionType::GroupByPerfectHash ||
1046  query_mem_desc_.getQueryDescriptionType() ==
1047  QueryDescriptionType::GroupByBaselineHash));
1048  } else {
1049  return permutation_.empty() && (query_mem_desc_.getQueryDescriptionType() ==
1050  QueryDescriptionType::GroupByPerfectHash ||
1051  query_mem_desc_.getQueryDescriptionType() ==
1052  QueryDescriptionType::GroupByBaselineHash);
1053  }
1054 }
1055 
1056 bool ResultSet::isZeroCopyColumnarConversionPossible(size_t column_idx) const {
1057  return query_mem_desc_.didOutputColumnar() &&
1058  query_mem_desc_.getQueryDescriptionType() == QueryDescriptionType::Projection &&
1059  appended_storage_.empty() && storage_ &&
1060  (lazy_fetch_info_.empty() || !lazy_fetch_info_[column_idx].is_lazily_fetched);
1061 }
1062 
1063 const int8_t* ResultSet::getColumnarBuffer(size_t column_idx) const {
1064  CHECK(isZeroCopyColumnarConversionPossible(column_idx));
1065  return storage_->getUnderlyingBuffer() + query_mem_desc_.getColOffInBytes(column_idx);
1066 }
1067 
1068 // returns a bitmap (and total number) of all single slot targets
1069 std::tuple<std::vector<bool>, size_t> ResultSet::getSingleSlotTargetBitmap() const {
1070  std::vector<bool> target_bitmap(targets_.size(), true);
1071  size_t num_single_slot_targets = 0;
1072  for (size_t target_idx = 0; target_idx < targets_.size(); target_idx++) {
1073  const auto& sql_type = targets_[target_idx].sql_type;
1074  if (targets_[target_idx].is_agg && targets_[target_idx].agg_kind == kAVG) {
1075  target_bitmap[target_idx] = false;
1076  } else if (sql_type.is_varlen()) {
1077  target_bitmap[target_idx] = false;
1078  } else {
1079  num_single_slot_targets++;
1080  }
1081  }
1082  return std::make_tuple(std::move(target_bitmap), num_single_slot_targets);
1083 }
1084 
1093 std::tuple<std::vector<bool>, size_t> ResultSet::getSupportedSingleSlotTargetBitmap()
1094  const {
1095  CHECK(isDirectColumnarConversionPossible());
1096  auto [single_slot_targets, num_single_slot_targets] = getSingleSlotTargetBitmap();
1097 
1098  for (size_t target_idx = 0; target_idx < single_slot_targets.size(); target_idx++) {
1099  const auto& target = targets_[target_idx];
1100  if (single_slot_targets[target_idx] &&
1101  (is_distinct_target(target) || target.agg_kind == kAPPROX_QUANTILE ||
1102  (target.is_agg && target.agg_kind == kSAMPLE && target.sql_type == kFLOAT))) {
1103  single_slot_targets[target_idx] = false;
1104  num_single_slot_targets--;
1105  }
1106  }
1107  CHECK_GE(num_single_slot_targets, size_t(0));
1108  return std::make_tuple(std::move(single_slot_targets), num_single_slot_targets);
1109 }
1110 
1111 // returns the starting slot index for all targets in the result set
1112 std::vector<size_t> ResultSet::getSlotIndicesForTargetIndices() const {
1113  std::vector<size_t> slot_indices(targets_.size(), 0);
1114  size_t slot_index = 0;
1115  for (size_t target_idx = 0; target_idx < targets_.size(); target_idx++) {
1116  slot_indices[target_idx] = slot_index;
1117  slot_index = advance_slot(slot_index, targets_[target_idx], false);
1118  }
1119  return slot_indices;
1120 }
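// --- Editor's sketch (not part of ResultSet.cpp): why slot indices and target indices
// can diverge. An AVG target is stored as a (sum, count) pair and therefore spans two
// slots, so the running slot index advances by 2 for it and by 1 for a simple
// single-slot target. This is a simplified stand-in for advance_slot(), which also
// handles varlen and other multi-slot targets; all names here are hypothetical.
#include <cstddef>
#include <vector>

struct TargetSketch {
  bool is_avg;  // true for AVG aggregates
};

static std::vector<size_t> slot_indices_sketch(const std::vector<TargetSketch>& targets) {
  std::vector<size_t> slot_indices(targets.size(), 0);
  size_t slot_idx = 0;
  for (size_t target_idx = 0; target_idx < targets.size(); ++target_idx) {
    slot_indices[target_idx] = slot_idx;
    slot_idx += targets[target_idx].is_avg ? 2 : 1;  // simplified advance_slot()
  }
  return slot_indices;
}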
1121 
1122 // namespace result_set
1123 
1124 bool result_set::can_use_parallel_algorithms(const ResultSet& rows) {
1125  return !rows.isTruncated();
1126 }
1127 
1128 bool result_set::use_parallel_algorithms(const ResultSet& rows) {
1129  return result_set::can_use_parallel_algorithms(rows) && rows.entryCount() >= 20000;
1130 }