OmniSciDB  c1a53651b2
QueryExecutionContext Class Reference

#include <QueryExecutionContext.h>


Public Member Functions

 QueryExecutionContext (const RelAlgExecutionUnit &ra_exe_unit, const QueryMemoryDescriptor &, const Executor *executor, const ExecutorDeviceType device_type, const ExecutorDispatchMode dispatch_mode, const int device_id, const shared::TableKey &outer_table_key, const int64_t num_rows, const std::vector< std::vector< const int8_t * >> &col_buffers, const std::vector< std::vector< uint64_t >> &frag_offsets, std::shared_ptr< RowSetMemoryOwner > row_set_mem_owner, const bool output_columnar, const bool sort_on_gpu, const size_t thread_idx, RenderInfo *)
 
ResultSetPtr getRowSet (const RelAlgExecutionUnit &ra_exe_unit, const QueryMemoryDescriptor &query_mem_desc) const
 
ResultSetPtr groupBufferToResults (const size_t i) const
 
std::vector< int64_t * > launchGpuCode (const RelAlgExecutionUnit &ra_exe_unit, const CompilationContext *compilation_context, const bool hoist_literals, const std::vector< int8_t > &literal_buff, std::vector< std::vector< const int8_t * >> col_buffers, const std::vector< std::vector< int64_t >> &num_rows, const std::vector< std::vector< uint64_t >> &frag_row_offsets, const int32_t scan_limit, Data_Namespace::DataMgr *data_mgr, const unsigned block_size_x, const unsigned grid_size_x, const int device_id, const size_t shared_memory_size, int32_t *error_code, const uint32_t num_tables, const bool allow_runtime_interrupt, const std::vector< int8_t * > &join_hash_tables, RenderAllocatorMap *render_allocator_map, bool optimize_cuda_block_and_grid_sizes)
 
std::vector< int64_t * > launchCpuCode (const RelAlgExecutionUnit &ra_exe_unit, const CpuCompilationContext *fn_ptrs, const bool hoist_literals, const std::vector< int8_t > &literal_buff, std::vector< std::vector< const int8_t * >> col_buffers, const std::vector< std::vector< int64_t >> &num_rows, const std::vector< std::vector< uint64_t >> &frag_row_offsets, const int32_t scan_limit, int32_t *error_code, const uint32_t num_tables, const std::vector< int8_t * > &join_hash_tables, const int64_t num_rows_to_process=-1)
 
int64_t getAggInitValForIndex (const size_t index) const
 

Private Types

enum  {
  COL_BUFFERS, NUM_FRAGMENTS, LITERALS, NUM_ROWS,
  FRAG_ROW_OFFSETS, MAX_MATCHED, TOTAL_MATCHED, INIT_AGG_VALS,
  GROUPBY_BUF, ERROR_CODE, NUM_TABLES, JOIN_HASH_TABLES,
  ROW_FUNC_MGR, KERN_PARAM_COUNT
}
 

Private Member Functions

std::vector< int8_t * > prepareKernelParams (const std::vector< std::vector< const int8_t * >> &col_buffers, const std::vector< int8_t > &literal_buff, const std::vector< std::vector< int64_t >> &num_rows, const std::vector< std::vector< uint64_t >> &frag_offsets, const int32_t scan_limit, const std::vector< int64_t > &init_agg_vals, const std::vector< int32_t > &error_codes, const uint32_t num_tables, const std::vector< int8_t * > &join_hash_tables, Data_Namespace::DataMgr *data_mgr, const int device_id, const bool hoist_literals, const bool is_group_by) const
 
ResultSetPtr groupBufferToDeinterleavedResults (const size_t i) const
 

Private Attributes

std::unique_ptr< DeviceAllocator > gpu_allocator_
 
QueryMemoryDescriptor query_mem_desc_
 
const Executor * executor_
 
const ExecutorDeviceType device_type_
 
const ExecutorDispatchMode dispatch_mode_
 
std::shared_ptr< RowSetMemoryOwner > row_set_mem_owner_
 
const bool output_columnar_
 
std::unique_ptr< QueryMemoryInitializer > query_buffers_
 
std::unique_ptr< ResultSet > estimator_result_set_
 

Friends

class Executor
 

Detailed Description

QueryExecutionContext bundles the per-device state needed to run one generated query kernel: it owns the query output buffers (query_buffers_), an optional GPU allocator (gpu_allocator_) and the estimator result set, launches the compiled code on CPU or GPU, and turns the filled group-by buffers into ResultSet objects.

Definition at line 39 of file QueryExecutionContext.h.
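
The following sketch is illustrative only (it is not part of the OmniSciDB sources) and shows the call sequence implied by the "Referenced by" notes on this page: the Executor constructs a QueryExecutionContext per device, runs the compiled kernel via launchCpuCode() or launchGpuCode(), and then calls getRowSet() to turn the filled query buffers into a ResultSet. The helper name and all arguments are assumptions supplied by the caller.

// Hypothetical helper, illustration only; assumes QueryExecutionContext.h and
// the usual OmniSciDB query-engine types are already available.
ResultSetPtr run_cpu_kernel_sketch(
    QueryExecutionContext& query_exe_context,
    const RelAlgExecutionUnit& ra_exe_unit,
    const QueryMemoryDescriptor& query_mem_desc,
    const CpuCompilationContext* native_code,
    const std::vector<int8_t>& literal_buff,
    std::vector<std::vector<const int8_t*>> col_buffers,
    const std::vector<std::vector<int64_t>>& num_rows,
    const std::vector<std::vector<uint64_t>>& frag_row_offsets) {
  int32_t error_code{0};
  // Run the JIT-compiled kernel over the supplied fragments on the CPU. The
  // returned out_vec only matters for non-grouped aggregates and is ignored here.
  query_exe_context.launchCpuCode(ra_exe_unit,
                                  native_code,
                                  /*hoist_literals=*/true,
                                  literal_buff,
                                  col_buffers,
                                  num_rows,
                                  frag_row_offsets,
                                  /*scan_limit=*/0,
                                  &error_code,
                                  /*num_tables=*/1,
                                  /*join_hash_tables=*/{});
  // Convert the filled group-by buffers into a ResultSet (reduced across
  // per-SM buffers when running on GPU).
  return query_exe_context.getRowSet(ra_exe_unit, query_mem_desc);
}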

Member Enumeration Documentation

anonymous enum
private

These enumerators are the slot indices of the flat kernel-parameter array assembled by prepareKernelParams(); KERN_PARAM_COUNT is the total number of slots.

Enumerator
COL_BUFFERS 
NUM_FRAGMENTS 
LITERALS 
NUM_ROWS 
FRAG_ROW_OFFSETS 
MAX_MATCHED 
TOTAL_MATCHED 
INIT_AGG_VALS 
GROUPBY_BUF 
ERROR_CODE 
NUM_TABLES 
JOIN_HASH_TABLES 
ROW_FUNC_MGR 
KERN_PARAM_COUNT 

Definition at line 101 of file QueryExecutionContext.h.
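
A minimal stand-alone illustration of the pattern (not OmniSciDB code, enumerator list trimmed): an unnamed enum supplies stable, named indices into the parameter array, and the trailing enumerator doubles as the array size.

#include <cstdint>
#include <vector>

namespace {
enum { COL_BUFFERS, NUM_FRAGMENTS, LITERALS, KERN_PARAM_COUNT };  // trimmed list
}

int main() {
  // One slot per kernel argument; slots are addressed by name rather than by
  // magic numbers, and KERN_PARAM_COUNT sizes the array.
  std::vector<int8_t*> params(KERN_PARAM_COUNT, nullptr);
  static int8_t literal_storage[8];
  params[LITERALS] = literal_storage;
  return params[LITERALS] == literal_storage ? 0 : 1;
}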

Constructor & Destructor Documentation

QueryExecutionContext::QueryExecutionContext ( const RelAlgExecutionUnit &  ra_exe_unit,
const QueryMemoryDescriptor &  query_mem_desc,
const Executor *  executor,
const ExecutorDeviceType  device_type,
const ExecutorDispatchMode  dispatch_mode,
const int  device_id,
const shared::TableKey &  outer_table_key,
const int64_t  num_rows,
const std::vector< std::vector< const int8_t * >> &  col_buffers,
const std::vector< std::vector< uint64_t >> &  frag_offsets,
std::shared_ptr< RowSetMemoryOwner >  row_set_mem_owner,
const bool  output_columnar,
const bool  sort_on_gpu,
const size_t  thread_idx,
RenderInfo *  render_info 
)

Definition at line 33 of file QueryExecutionContext.cpp.

References CHECK, getQueryEngineCudaStreamForDevice(), GPU, gpu_allocator_, heavyai::InSituFlagsOwnerInterface::isInSitu(), query_buffers_, query_mem_desc, RenderInfo::render_allocator_map_ptr, and sort_on_gpu().

49  : query_mem_desc_(query_mem_desc)
50  , executor_(executor)
51  , device_type_(device_type)
52  , dispatch_mode_(dispatch_mode)
53  , row_set_mem_owner_(row_set_mem_owner)
54  , output_columnar_(output_columnar) {
55  CHECK(executor);
56  auto data_mgr = executor->getDataMgr();
57  if (device_type == ExecutorDeviceType::GPU) {
58  gpu_allocator_ = std::make_unique<CudaAllocator>(
59  data_mgr, device_id, getQueryEngineCudaStreamForDevice(device_id));
60  }
61 
62  auto render_allocator_map = render_info && render_info->isInSitu()
63  ? render_info->render_allocator_map_ptr.get()
64  : nullptr;
65  query_buffers_ = std::make_unique<QueryMemoryInitializer>(ra_exe_unit,
66  query_mem_desc,
67  device_id,
68  device_type,
69  dispatch_mode,
70  output_columnar,
71  sort_on_gpu,
72  outer_table_key,
73  num_rows,
74  col_buffers,
75  frag_offsets,
76  render_allocator_map,
77  render_info,
78  row_set_mem_owner,
79  gpu_allocator_.get(),
80  thread_idx,
81  executor);
82 }

Member Function Documentation

int64_t QueryExecutionContext::getAggInitValForIndex ( const size_t  index) const

Definition at line 152 of file QueryExecutionContext.cpp.

References CHECK, and query_buffers_.

Referenced by Executor::executePlanWithoutGroupBy().

152  int64_t QueryExecutionContext::getAggInitValForIndex(const size_t index) const {
153  CHECK(query_buffers_);
154  return query_buffers_->getAggInitValForIndex(index);
155 }

ResultSetPtr QueryExecutionContext::getRowSet ( const RelAlgExecutionUnit &  ra_exe_unit,
const QueryMemoryDescriptor &  query_mem_desc 
) const

Definition at line 157 of file QueryExecutionContext.cpp.

References CHECK, CHECK_EQ, CPU, DEBUG_TIMER, device_type_, executor_, GPU, groupBufferToResults(), QueryMemoryDescriptor::hasVarlenOutput(), query_buffers_, query_mem_desc_, row_set_mem_owner_, and QueryMemoryDescriptor::threadsShareMemory().

Referenced by Executor::executePlanWithGroupBy().

159  const QueryMemoryDescriptor& query_mem_desc) const {
160  auto timer = DEBUG_TIMER(__func__);
161  std::vector<std::pair<ResultSetPtr, std::vector<size_t>>> results_per_sm;
162  CHECK(query_buffers_);
163  const auto group_by_buffers_size = query_buffers_->getNumBuffers();
164  if (device_type_ == ExecutorDeviceType::CPU) {
165  const size_t expected_num_buffers = query_mem_desc.hasVarlenOutput() ? 2 : 1;
166  CHECK_EQ(expected_num_buffers, group_by_buffers_size);
167  return groupBufferToResults(0);
168  }
169  const size_t step{query_mem_desc_.threadsShareMemory() ? executor_->blockSize() : 1};
170  const size_t group_by_output_buffers_size =
171  group_by_buffers_size - (query_mem_desc.hasVarlenOutput() ? 1 : 0);
172  for (size_t i = 0; i < group_by_output_buffers_size; i += step) {
173  results_per_sm.emplace_back(groupBufferToResults(i), std::vector<size_t>{});
174  }
175  CHECK(device_type_ == ExecutorDeviceType::GPU);
176  return executor_->reduceMultiDeviceResults(
177  ra_exe_unit, results_per_sm, row_set_mem_owner_, query_mem_desc);
178 }
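A hypothetical caller-side fragment (assuming the surrounding OmniSciDB headers): the ResultSetPtr returned by getRowSet() is an ordinary std::shared_ptr<ResultSet>, so callers can inspect it directly, for example to log the produced row count.

// Illustration only; rows comes from QueryExecutionContext::getRowSet().
void log_row_count_sketch(const ResultSetPtr& rows) {
  if (rows) {
    VLOG(1) << "getRowSet produced " << rows->rowCount() << " row(s)";
  }
}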

ResultSetPtr QueryExecutionContext::groupBufferToDeinterleavedResults ( const size_t  i) const
private

Definition at line 84 of file QueryExecutionContext.cpp.

References CHECK, CPU, executor_, ResultSet::fixupQueryMemoryDescriptor(), g_enable_non_kernel_time_query_interrupt, QueryMemoryDescriptor::getColOffInBytes(), QueryMemoryDescriptor::getColOffInBytesInNextBin(), QueryMemoryDescriptor::getSlotCount(), output_columnar_, query_buffers_, query_mem_desc_, ResultSetStorage::reduceSingleRow(), row_set_mem_owner_, and UNLIKELY.

Referenced by groupBufferToResults().

85  const size_t i) const {
86  CHECK(!output_columnar_);
87  const auto& result_set = query_buffers_->getResultSet(i);
88  auto deinterleaved_query_mem_desc =
89  ResultSet::fixupQueryMemoryDescriptor(query_mem_desc_);
90  deinterleaved_query_mem_desc.setHasInterleavedBinsOnGpu(false);
91  deinterleaved_query_mem_desc.useConsistentSlotWidthSize(8);
92 
93  auto deinterleaved_result_set =
94  std::make_shared<ResultSet>(result_set->getTargetInfos(),
95  std::vector<ColumnLazyFetchInfo>{},
96  std::vector<std::vector<const int8_t*>>{},
97  std::vector<std::vector<int64_t>>{},
98  std::vector<int64_t>{},
99  ExecutorDeviceType::CPU,
100  -1,
101  deinterleaved_query_mem_desc,
102  row_set_mem_owner_,
103  executor_->blockSize(),
104  executor_->gridSize());
105  auto deinterleaved_storage =
106  deinterleaved_result_set->allocateStorage(executor_->plan_state_->init_agg_vals_);
107  auto deinterleaved_buffer =
108  reinterpret_cast<int64_t*>(deinterleaved_storage->getUnderlyingBuffer());
109  const auto rows_ptr = result_set->getStorage()->getUnderlyingBuffer();
110  size_t deinterleaved_buffer_idx = 0;
111  const size_t agg_col_count{query_mem_desc_.getSlotCount()};
112  auto do_work = [&](const size_t bin_base_off) {
113  std::vector<int64_t> agg_vals(agg_col_count, 0);
114  memcpy(&agg_vals[0],
115  &executor_->plan_state_->init_agg_vals_[0],
116  agg_col_count * sizeof(agg_vals[0]));
117  ResultSetStorage::reduceSingleRow(rows_ptr + bin_base_off,
118  executor_->warpSize(),
119  false,
120  true,
121  agg_vals,
122  query_mem_desc_,
123  result_set->getTargetInfos(),
124  executor_->plan_state_->init_agg_vals_);
125  for (size_t agg_idx = 0; agg_idx < agg_col_count;
126  ++agg_idx, ++deinterleaved_buffer_idx) {
127  deinterleaved_buffer[deinterleaved_buffer_idx] = agg_vals[agg_idx];
128  }
129  };
130  if (g_enable_non_kernel_time_query_interrupt) {
131  for (size_t bin_base_off = query_mem_desc_.getColOffInBytes(0), bin_idx = 0;
132  bin_idx < result_set->entryCount();
133  ++bin_idx, bin_base_off += query_mem_desc_.getColOffInBytesInNextBin(0)) {
134  if (UNLIKELY((bin_idx & 0xFFFF) == 0 &&
135  executor_->checkNonKernelTimeInterrupted())) {
136  throw std::runtime_error(
137  "Query execution has interrupted during result set reduction");
138  }
139  do_work(bin_base_off);
140  }
141  } else {
142  for (size_t bin_base_off = query_mem_desc_.getColOffInBytes(0), bin_idx = 0;
143  bin_idx < result_set->entryCount();
144  ++bin_idx, bin_base_off += query_mem_desc_.getColOffInBytesInNextBin(0)) {
145  do_work(bin_base_off);
146  }
147  }
148  query_buffers_->resetResultSet(i);
149  return deinterleaved_result_set;
150 }

ResultSetPtr QueryExecutionContext::groupBufferToResults ( const size_t  i) const

Definition at line 180 of file QueryExecutionContext.cpp.

References device_type_, groupBufferToDeinterleavedResults(), QueryMemoryDescriptor::interleavedBins(), query_buffers_, and query_mem_desc_.

Referenced by getRowSet().

180  ResultSetPtr QueryExecutionContext::groupBufferToResults(const size_t i) const {
181  if (query_mem_desc_.interleavedBins(device_type_)) {
182  return groupBufferToDeinterleavedResults(i);
183  }
184  return query_buffers_->getResultSetOwned(i);
185 }

std::vector< int64_t * > QueryExecutionContext::launchCpuCode ( const RelAlgExecutionUnit &  ra_exe_unit,
const CpuCompilationContext *  fn_ptrs,
const bool  hoist_literals,
const std::vector< int8_t > &  literal_buff,
std::vector< std::vector< const int8_t * >>  col_buffers,
const std::vector< std::vector< int64_t >> &  num_rows,
const std::vector< std::vector< uint64_t >> &  frag_row_offsets,
const int32_t  scan_limit,
int32_t *  error_code,
const uint32_t  num_tables,
const std::vector< int8_t * > &  join_hash_tables,
const int64_t  num_rows_to_process = -1 
)

Definition at line 556 of file QueryExecutionContext.cpp.

References align_to_int64(), CHECK, CHECK_EQ, compact_init_vals(), CPU, DEBUG_TIMER, QueryMemoryDescriptor::didOutputColumnar(), RelAlgExecutionUnit::estimator, estimator_result_set_, executor_, CpuCompilationContext::func(), QueryMemoryDescriptor::getColsSize(), QueryMemoryDescriptor::getQueryDescriptionType(), INJECT_TIMER, QueryMemoryDescriptor::isGroupBy(), foreign_storage::num_rows_to_process(), Projection, query_buffers_, query_mem_desc_, and QueryMemoryDescriptor::useStreamingTopN().

Referenced by Executor::executePlanWithGroupBy(), and Executor::executePlanWithoutGroupBy().

568  const int64_t num_rows_to_process) {
569  auto timer = DEBUG_TIMER(__func__);
570  INJECT_TIMER(lauchCpuCode);
571 
572  CHECK(query_buffers_);
573  const auto& init_agg_vals = query_buffers_->init_agg_vals_;
574 
575  std::vector<const int8_t**> multifrag_col_buffers;
576  for (auto& col_buffer : col_buffers) {
577  multifrag_col_buffers.push_back(col_buffer.empty() ? nullptr : col_buffer.data());
578  }
579  const int8_t*** multifrag_cols_ptr{
580  multifrag_col_buffers.empty() ? nullptr : &multifrag_col_buffers[0]};
581  const uint64_t num_fragments =
582  multifrag_cols_ptr ? static_cast<uint64_t>(col_buffers.size()) : uint64_t(0);
583  const auto num_out_frags = multifrag_cols_ptr ? num_fragments : uint64_t(0);
584 
585  const bool is_group_by{query_mem_desc_.isGroupBy()};
586  std::vector<int64_t*> out_vec;
587  if (ra_exe_unit.estimator) {
588  // Subfragments collect the result from multiple runs in a single
589  // result set.
590  if (!estimator_result_set_) {
591  estimator_result_set_.reset(
592  new ResultSet(ra_exe_unit.estimator, ExecutorDeviceType::CPU, 0, nullptr));
593  }
594  out_vec.push_back(
595  reinterpret_cast<int64_t*>(estimator_result_set_->getHostEstimatorBuffer()));
596  } else {
597  if (!is_group_by) {
598  for (size_t i = 0; i < init_agg_vals.size(); ++i) {
599  auto buff = new int64_t[num_out_frags];
600  out_vec.push_back(static_cast<int64_t*>(buff));
601  }
602  }
603  }
604 
605  CHECK_EQ(num_rows.size(), col_buffers.size());
606  std::vector<int64_t> flatened_num_rows;
607  for (auto& nums : num_rows) {
608  flatened_num_rows.insert(flatened_num_rows.end(), nums.begin(), nums.end());
609  }
610  std::vector<uint64_t> flatened_frag_offsets;
611  for (auto& offsets : frag_offsets) {
612  flatened_frag_offsets.insert(
613  flatened_frag_offsets.end(), offsets.begin(), offsets.end());
614  }
615  int64_t rowid_lookup_num_rows{*error_code ? *error_code + 1 : 0};
616  int64_t* num_rows_ptr;
617  if (num_rows_to_process > 0) {
618  flatened_num_rows[0] = num_rows_to_process;
619  num_rows_ptr = flatened_num_rows.data();
620  } else {
621  num_rows_ptr =
622  rowid_lookup_num_rows ? &rowid_lookup_num_rows : flatened_num_rows.data();
623  }
624  int32_t total_matched_init{0};
625 
626  std::vector<int64_t> cmpt_val_buff;
627  if (is_group_by) {
628  cmpt_val_buff =
629  compact_init_vals(align_to_int64(query_mem_desc_.getColsSize()) / sizeof(int64_t),
630  init_agg_vals,
631  query_mem_desc_);
632  }
633 
634  RowFunctionManager mgr(this->executor_, ra_exe_unit);
635  int8_t* row_func_mgr_ptr = reinterpret_cast<int8_t*>(&mgr);
636 
637  CHECK(native_code);
638  const int64_t* join_hash_tables_ptr =
639  join_hash_tables.size() == 1
640  ? reinterpret_cast<const int64_t*>(join_hash_tables[0])
641  : (join_hash_tables.size() > 1
642  ? reinterpret_cast<const int64_t*>(&join_hash_tables[0])
643  : nullptr);
644  if (hoist_literals) {
645  using agg_query = void (*)(const int8_t***, // col_buffers
646  const uint64_t*, // num_fragments
647  const int8_t*, // literals
648  const int64_t*, // num_rows
649  const uint64_t*, // frag_row_offsets
650  const int32_t*, // max_matched
651  int32_t*, // total_matched
652  const int64_t*, // init_agg_value
653  int64_t**, // out
654  int32_t*, // error_code
655  const uint32_t*, // num_tables
656  const int64_t*, // join_hash_tables_ptr
657  const int8_t*); // row_func_mgr
658  if (is_group_by) {
659  reinterpret_cast<agg_query>(native_code->func())(
660  multifrag_cols_ptr,
661  &num_fragments,
662  literal_buff.data(),
663  num_rows_ptr,
664  flatened_frag_offsets.data(),
665  &scan_limit,
666  &total_matched_init,
667  cmpt_val_buff.data(),
668  query_buffers_->getGroupByBuffersPtr(),
669  error_code,
670  &num_tables,
671  join_hash_tables_ptr,
672  row_func_mgr_ptr);
673  } else {
674  reinterpret_cast<agg_query>(native_code->func())(multifrag_cols_ptr,
675  &num_fragments,
676  literal_buff.data(),
677  num_rows_ptr,
678  flatened_frag_offsets.data(),
679  &scan_limit,
680  &total_matched_init,
681  init_agg_vals.data(),
682  out_vec.data(),
683  error_code,
684  &num_tables,
685  join_hash_tables_ptr,
686  row_func_mgr_ptr);
687  }
688  } else {
689  using agg_query = void (*)(const int8_t***, // col_buffers
690  const uint64_t*, // num_fragments
691  const int64_t*, // num_rows
692  const uint64_t*, // frag_row_offsets
693  const int32_t*, // max_matched
694  int32_t*, // total_matched
695  const int64_t*, // init_agg_value
696  int64_t**, // out
697  int32_t*, // error_code
698  const uint32_t*, // num_tables
699  const int64_t*, // join_hash_tables_ptr
700  const int8_t*); // row_func_mgr
701  if (is_group_by) {
702  reinterpret_cast<agg_query>(native_code->func())(
703  multifrag_cols_ptr,
704  &num_fragments,
705  num_rows_ptr,
706  flatened_frag_offsets.data(),
707  &scan_limit,
708  &total_matched_init,
709  cmpt_val_buff.data(),
710  query_buffers_->getGroupByBuffersPtr(),
711  error_code,
712  &num_tables,
713  join_hash_tables_ptr,
714  row_func_mgr_ptr);
715  } else {
716  reinterpret_cast<agg_query>(native_code->func())(multifrag_cols_ptr,
717  &num_fragments,
718  num_rows_ptr,
719  flatened_frag_offsets.data(),
720  &scan_limit,
721  &total_matched_init,
722  init_agg_vals.data(),
723  out_vec.data(),
724  error_code,
725  &num_tables,
726  join_hash_tables_ptr,
727  row_func_mgr_ptr);
728  }
729  }
730 
731  if (ra_exe_unit.estimator) {
732  return {};
733  }
734 
735  if (rowid_lookup_num_rows && *error_code < 0) {
736  *error_code = 0;
737  }
738 
739  if (query_mem_desc_.useStreamingTopN()) {
740  query_buffers_->applyStreamingTopNOffsetCpu(query_mem_desc_, ra_exe_unit);
741  }
742 
743  if (query_mem_desc_.didOutputColumnar() &&
744  query_mem_desc_.getQueryDescriptionType() == QueryDescriptionType::Projection) {
745  query_buffers_->compactProjectionBuffersCpu(query_mem_desc_, total_matched_init);
746  }
747  return out_vec;
748 }

std::vector< int64_t * > QueryExecutionContext::launchGpuCode ( const RelAlgExecutionUnit &  ra_exe_unit,
const CompilationContext *  compilation_context,
const bool  hoist_literals,
const std::vector< int8_t > &  literal_buff,
std::vector< std::vector< const int8_t * >>  col_buffers,
const std::vector< std::vector< int64_t >> &  num_rows,
const std::vector< std::vector< uint64_t >> &  frag_row_offsets,
const int32_t  scan_limit,
Data_Namespace::DataMgr *  data_mgr,
const unsigned  block_size_x,
const unsigned  grid_size_x,
const int  device_id,
const size_t  shared_memory_size,
int32_t *  error_code,
const uint32_t  num_tables,
const bool  allow_runtime_interrupt,
const std::vector< int8_t * > &  join_hash_tables,
RenderAllocatorMap *  render_allocator_map,
bool  optimize_cuda_block_and_grid_sizes 
)

Definition at line 206 of file QueryExecutionContext.cpp.

References anonymous_namespace{QueryExecutionContext.cpp}::aggregate_error_codes(), CHECK, CHECK_EQ, create_device_kernel(), DEBUG_TIMER, QueryMemoryDescriptor::didOutputColumnar(), dispatch_mode_, ERROR_CODE, RelAlgExecutionUnit::estimator, estimator_result_set_, executor_, g_dynamic_watchdog_time_limit, g_enable_dynamic_watchdog, get_num_allocated_rows_from_gpu(), QueryMemoryDescriptor::getEntryCount(), QueryMemoryDescriptor::getQueryDescriptionType(), RenderAllocatorMap::getRenderAllocator(), GPU, gpu_allocator_, GROUPBY_BUF, QueryMemoryDescriptor::hasKeylessHash(), INIT_AGG_VALS, INJECT_TIMER, inplace_sort_gpu(), QueryMemoryDescriptor::isGroupBy(), KERN_PARAM_COUNT, LITERALS, MAX_MATCHED, SortInfo::order_entries, output_columnar_, prepareKernelParams(), Projection, query_buffers_, query_mem_desc_, RelAlgExecutionUnit::sort_info, QueryMemoryDescriptor::sortOnGpu(), to_string(), TOTAL_MATCHED, RelAlgExecutionUnit::use_bump_allocator, use_speculative_top_n(), QueryMemoryDescriptor::useStreamingTopN(), QueryMemoryDescriptor::varlenOutputBufferElemSize(), and VLOG.

Referenced by Executor::executePlanWithGroupBy(), and Executor::executePlanWithoutGroupBy().

225  bool optimize_cuda_block_and_grid_sizes) {
226  auto timer = DEBUG_TIMER(__func__);
227  INJECT_TIMER(lauchGpuCode);
230  CHECK(compilation_context);
231  const auto& init_agg_vals = query_buffers_->init_agg_vals_;
232 
233  bool is_group_by{query_mem_desc_.isGroupBy()};
234 
235  RenderAllocator* render_allocator = nullptr;
236  if (render_allocator_map) {
237  render_allocator = render_allocator_map->getRenderAllocator(device_id);
238  }
239 
240  auto kernel = create_device_kernel(compilation_context, device_id);
241 
242  std::vector<int64_t*> out_vec;
243  uint32_t num_fragments = col_buffers.size();
244  std::vector<int32_t> error_codes(grid_size_x * block_size_x);
245 
246  auto prepareClock = kernel->make_clock();
247  auto launchClock = kernel->make_clock();
248  auto finishClock = kernel->make_clock();
249 
250  if (g_enable_dynamic_watchdog || (allow_runtime_interrupt && !render_allocator)) {
251  prepareClock->start();
252  }
253 
254  if (g_enable_dynamic_watchdog) {
255  kernel->initializeDynamicWatchdog(
256  executor_->interrupted_.load(),
257  g_dynamic_watchdog_time_limit);
258  }
259 
260  if (allow_runtime_interrupt && !render_allocator) {
261  kernel->initializeRuntimeInterrupter(device_id);
262  }
263 
264  auto kernel_params = prepareKernelParams(col_buffers,
265  literal_buff,
266  num_rows,
267  frag_offsets,
268  scan_limit,
269  init_agg_vals,
270  error_codes,
271  num_tables,
272  join_hash_tables,
273  data_mgr,
274  device_id,
275  hoist_literals,
276  is_group_by);
277 
278  CHECK_EQ(static_cast<size_t>(KERN_PARAM_COUNT), kernel_params.size());
279  CHECK(!kernel_params[GROUPBY_BUF]);
280 
281  const unsigned block_size_y = 1;
282  const unsigned block_size_z = 1;
283  const unsigned grid_size_y = 1;
284  const unsigned grid_size_z = 1;
285  const auto total_thread_count = block_size_x * grid_size_x;
286  const auto err_desc = kernel_params[ERROR_CODE];
287  if (is_group_by) {
288  CHECK(!(query_buffers_->getGroupByBuffersSize() == 0) || render_allocator);
289  bool can_sort_on_gpu = query_mem_desc_.sortOnGpu();
290  auto gpu_group_by_buffers =
291  query_buffers_->createAndInitializeGroupByBufferGpu(ra_exe_unit,
292  query_mem_desc_,
293  kernel_params[INIT_AGG_VALS],
294  device_id,
295  dispatch_mode_,
296  block_size_x,
297  grid_size_x,
298  executor_->warpSize(),
299  can_sort_on_gpu,
300  output_columnar_,
301  render_allocator);
302  const auto max_matched = static_cast<int32_t>(gpu_group_by_buffers.entry_count);
303  gpu_allocator_->copyToDevice(
304  kernel_params[MAX_MATCHED], &max_matched, sizeof(max_matched));
305 
306  kernel_params[GROUPBY_BUF] = gpu_group_by_buffers.ptrs;
307  std::vector<void*> param_ptrs;
308  for (auto& param : kernel_params) {
309  param_ptrs.push_back(&param);
310  }
311 
312  if (g_enable_dynamic_watchdog || (allow_runtime_interrupt && !render_allocator)) {
313  auto prepareTime = prepareClock->stop();
314  VLOG(1) << "Device " << std::to_string(device_id)
315  << ": launchGpuCode: group-by prepare: " << std::to_string(prepareTime)
316  << " ms";
317  launchClock->start();
318  }
319 
320  if (hoist_literals) {
321  kernel->launch(grid_size_x,
322  grid_size_y,
323  grid_size_z,
324  block_size_x,
325  block_size_y,
326  block_size_z,
327  shared_memory_size,
328  &param_ptrs[0],
329  optimize_cuda_block_and_grid_sizes);
330  } else {
331  param_ptrs.erase(param_ptrs.begin() + LITERALS); // TODO(alex): remove
332  kernel->launch(grid_size_x,
333  grid_size_y,
334  grid_size_z,
335  block_size_x,
336  block_size_y,
337  block_size_z,
338  shared_memory_size,
339  &param_ptrs[0],
340  optimize_cuda_block_and_grid_sizes);
341  }
342  if (g_enable_dynamic_watchdog || (allow_runtime_interrupt && !render_allocator)) {
343  auto launchTime = launchClock->stop();
344  VLOG(1) << "Device " << std::to_string(device_id)
345  << ": launchGpuCode: group-by cuLaunchKernel: "
346  << std::to_string(launchTime) << " ms";
347  finishClock->start();
348  }
349 
350  gpu_allocator_->copyFromDevice(reinterpret_cast<int8_t*>(error_codes.data()),
351  reinterpret_cast<int8_t*>(err_desc),
352  error_codes.size() * sizeof(error_codes[0]));
353  *error_code = aggregate_error_codes(error_codes);
354  if (*error_code > 0) {
355  return {};
356  }
357 
358  if (!render_allocator) {
359  if (query_mem_desc_.useStreamingTopN()) {
360  query_buffers_->applyStreamingTopNOffsetGpu(data_mgr,
361  query_mem_desc_,
362  gpu_group_by_buffers,
363  ra_exe_unit,
364  total_thread_count,
365  device_id);
366  } else {
367  if (use_speculative_top_n(ra_exe_unit, query_mem_desc_)) {
368  try {
369  inplace_sort_gpu(ra_exe_unit.sort_info.order_entries,
370  query_mem_desc_,
371  gpu_group_by_buffers,
372  data_mgr,
373  device_id);
374  } catch (const std::bad_alloc&) {
375  throw SpeculativeTopNFailed("Failed during in-place GPU sort.");
376  }
377  }
378  if (query_mem_desc_.getQueryDescriptionType() ==
379  QueryDescriptionType::Projection) {
380  if (query_mem_desc_.didOutputColumnar()) {
381  query_buffers_->compactProjectionBuffersGpu(
382  query_mem_desc_,
383  data_mgr,
384  gpu_group_by_buffers,
385  get_num_allocated_rows_from_gpu(
386  *gpu_allocator_, kernel_params[TOTAL_MATCHED], device_id),
387  device_id);
388  } else {
389  size_t num_allocated_rows{0};
390  if (ra_exe_unit.use_bump_allocator) {
391  num_allocated_rows = get_num_allocated_rows_from_gpu(
392  *gpu_allocator_, kernel_params[TOTAL_MATCHED], device_id);
393  // First, check the error code. If we ran out of slots, don't copy data back
394  // into the ResultSet or update ResultSet entry count
395  if (*error_code < 0) {
396  return {};
397  }
398  }
399  query_buffers_->copyGroupByBuffersFromGpu(
400  *gpu_allocator_,
401  query_mem_desc_,
402  ra_exe_unit.use_bump_allocator ? num_allocated_rows
403  : query_mem_desc_.getEntryCount(),
404  gpu_group_by_buffers,
405  &ra_exe_unit,
406  block_size_x,
407  grid_size_x,
408  device_id,
409  can_sort_on_gpu && query_mem_desc_.hasKeylessHash());
410  if (num_allocated_rows) {
411  CHECK(ra_exe_unit.use_bump_allocator);
412  CHECK(!query_buffers_->result_sets_.empty());
413  query_buffers_->result_sets_.front()->updateStorageEntryCount(
414  num_allocated_rows);
415  }
416  }
417  } else {
418  query_buffers_->copyGroupByBuffersFromGpu(
419  *gpu_allocator_,
420  query_mem_desc_,
421  query_mem_desc_.getEntryCount(),
422  gpu_group_by_buffers,
423  &ra_exe_unit,
424  block_size_x,
425  grid_size_x,
426  device_id,
427  can_sort_on_gpu && query_mem_desc_.hasKeylessHash());
428  }
429  }
430  }
431  } else {
432  std::vector<int8_t*> out_vec_dev_buffers;
433  const size_t agg_col_count{ra_exe_unit.estimator ? size_t(1) : init_agg_vals.size()};
434  // by default, non-grouped aggregate queries generate one result per available thread
435  // in the lifetime of (potentially multi-fragment) kernel execution.
436  // We can reduce these intermediate results internally in the device and hence have
437  // only one result per device, if GPU shared memory optimizations are enabled.
438  const auto num_results_per_agg_col =
439  shared_memory_size ? 1 : block_size_x * grid_size_x * num_fragments;
440  const auto output_buffer_size_per_agg = num_results_per_agg_col * sizeof(int64_t);
441  if (ra_exe_unit.estimator) {
442  estimator_result_set_.reset(new ResultSet(
443  ra_exe_unit.estimator, ExecutorDeviceType::GPU, device_id, data_mgr));
444  out_vec_dev_buffers.push_back(estimator_result_set_->getDeviceEstimatorBuffer());
445  } else {
446  for (size_t i = 0; i < agg_col_count; ++i) {
447  int8_t* out_vec_dev_buffer =
448  num_fragments ? gpu_allocator_->alloc(output_buffer_size_per_agg) : nullptr;
449  out_vec_dev_buffers.push_back(out_vec_dev_buffer);
450  if (shared_memory_size) {
451  CHECK_EQ(output_buffer_size_per_agg, size_t(8));
452  gpu_allocator_->copyToDevice(reinterpret_cast<int8_t*>(out_vec_dev_buffer),
453  reinterpret_cast<const int8_t*>(&init_agg_vals[i]),
454  output_buffer_size_per_agg);
455  }
456  }
457  }
458  auto out_vec_dev_ptr = gpu_allocator_->alloc(agg_col_count * sizeof(int8_t*));
459  gpu_allocator_->copyToDevice(out_vec_dev_ptr,
460  reinterpret_cast<int8_t*>(out_vec_dev_buffers.data()),
461  agg_col_count * sizeof(int8_t*));
462  kernel_params[GROUPBY_BUF] = out_vec_dev_ptr;
463  std::vector<void*> param_ptrs;
464  for (auto& param : kernel_params) {
465  param_ptrs.push_back(&param);
466  }
467 
468  if (g_enable_dynamic_watchdog || (allow_runtime_interrupt && !render_allocator)) {
469  auto prepareTime = prepareClock->stop();
470 
471  VLOG(1) << "Device " << std::to_string(device_id)
472  << ": launchGpuCode: prepare: " << std::to_string(prepareTime) << " ms";
473  launchClock->start();
474  }
475 
476  if (hoist_literals) {
477  kernel->launch(grid_size_x,
478  grid_size_y,
479  grid_size_z,
480  block_size_x,
481  block_size_y,
482  block_size_z,
483  shared_memory_size,
484  &param_ptrs[0],
485  optimize_cuda_block_and_grid_sizes);
486  } else {
487  param_ptrs.erase(param_ptrs.begin() + LITERALS); // TODO(alex): remove
488  kernel->launch(grid_size_x,
489  grid_size_y,
490  grid_size_z,
491  block_size_x,
492  block_size_y,
493  block_size_z,
494  shared_memory_size,
495  &param_ptrs[0],
496  optimize_cuda_block_and_grid_sizes);
497  }
498 
499  if (g_enable_dynamic_watchdog || (allow_runtime_interrupt && !render_allocator)) {
500  auto launchTime = launchClock->stop();
501  VLOG(1) << "Device " << std::to_string(device_id)
502  << ": launchGpuCode: cuLaunchKernel: " << std::to_string(launchTime)
503  << " ms";
504  finishClock->start();
505  }
506 
507  gpu_allocator_->copyFromDevice(
508  &error_codes[0], err_desc, error_codes.size() * sizeof(error_codes[0]));
509  *error_code = aggregate_error_codes(error_codes);
510  if (*error_code > 0) {
511  return {};
512  }
513  if (ra_exe_unit.estimator) {
514  CHECK(estimator_result_set_);
515  estimator_result_set_->syncEstimatorBuffer();
516  return {};
517  }
518  for (size_t i = 0; i < agg_col_count; ++i) {
519  int64_t* host_out_vec = new int64_t[output_buffer_size_per_agg];
520  gpu_allocator_->copyFromDevice(
521  host_out_vec, out_vec_dev_buffers[i], output_buffer_size_per_agg);
522  out_vec.push_back(host_out_vec);
523  }
524  }
525  const auto count_distinct_bitmap_mem = query_buffers_->getCountDistinctBitmapPtr();
526  if (count_distinct_bitmap_mem) {
527  gpu_allocator_->copyFromDevice(query_buffers_->getCountDistinctHostPtr(),
528  reinterpret_cast<void*>(count_distinct_bitmap_mem),
529  query_buffers_->getCountDistinctBitmapBytes());
530  }
531 
532  const auto varlen_output_gpu_buf = query_buffers_->getVarlenOutputPtr();
533  if (varlen_output_gpu_buf) {
534  CHECK(query_mem_desc_.varlenOutputBufferElemSize());
535  const size_t varlen_output_buf_bytes =
536  query_mem_desc_.getEntryCount() *
537  query_mem_desc_.varlenOutputBufferElemSize().value();
538  CHECK(query_buffers_->getVarlenOutputHostPtr());
539  gpu_allocator_->copyFromDevice(query_buffers_->getVarlenOutputHostPtr(),
540  reinterpret_cast<void*>(varlen_output_gpu_buf),
541  varlen_output_buf_bytes);
542  }
543 
544  if (g_enable_dynamic_watchdog || (allow_runtime_interrupt && !render_allocator)) {
545  if (allow_runtime_interrupt) {
546  kernel->resetRuntimeInterrupter(device_id);
547  }
548  auto finishTime = finishClock->stop();
549  VLOG(1) << "Device " << std::to_string(device_id)
550  << ": launchGpuCode: finish: " << std::to_string(finishTime) << " ms";
551  }
552 
553  return out_vec;
554 }

std::vector< int8_t * > QueryExecutionContext::prepareKernelParams ( const std::vector< std::vector< const int8_t * >> &  col_buffers,
const std::vector< int8_t > &  literal_buff,
const std::vector< std::vector< int64_t >> &  num_rows,
const std::vector< std::vector< uint64_t >> &  frag_offsets,
const int32_t  scan_limit,
const std::vector< int64_t > &  init_agg_vals,
const std::vector< int32_t > &  error_codes,
const uint32_t  num_tables,
const std::vector< int8_t * > &  join_hash_tables,
Data_Namespace::DataMgr *  data_mgr,
const int  device_id,
const bool  hoist_literals,
const bool  is_group_by 
) const
private

Definition at line 750 of file QueryExecutionContext.cpp.

References align_to_int64(), CHECK, CHECK_EQ, COL_BUFFERS, compact_init_vals(), ERROR_CODE, FRAG_ROW_OFFSETS, QueryMemoryDescriptor::getColsSize(), gpu_allocator_, INIT_AGG_VALS, JOIN_HASH_TABLES, KERN_PARAM_COUNT, LITERALS, MAX_MATCHED, NUM_FRAGMENTS, NUM_ROWS, NUM_TABLES, output_columnar_, query_buffers_, query_mem_desc_, ROW_FUNC_MGR, and TOTAL_MATCHED.

Referenced by launchGpuCode().

763  const bool is_group_by) const {
764  CHECK(gpu_allocator_);
765  std::vector<int8_t*> params(KERN_PARAM_COUNT, 0);
766  const uint64_t num_fragments = static_cast<uint64_t>(col_buffers.size());
767  const size_t col_count{num_fragments > 0 ? col_buffers.front().size() : 0};
768  if (col_count) {
769  std::vector<int8_t*> multifrag_col_dev_buffers;
770  for (auto frag_col_buffers : col_buffers) {
771  std::vector<const int8_t*> col_dev_buffers;
772  for (auto col_buffer : frag_col_buffers) {
773  col_dev_buffers.push_back((int8_t*)col_buffer);
774  }
775  auto col_buffers_dev_ptr = gpu_allocator_->alloc(col_count * sizeof(int8_t*));
776  gpu_allocator_->copyToDevice(
777  col_buffers_dev_ptr, &col_dev_buffers[0], col_count * sizeof(int8_t*));
778  multifrag_col_dev_buffers.push_back(col_buffers_dev_ptr);
779  }
780  params[COL_BUFFERS] = gpu_allocator_->alloc(num_fragments * sizeof(int8_t*));
781 
782  gpu_allocator_->copyToDevice(params[COL_BUFFERS],
783  &multifrag_col_dev_buffers[0],
784  num_fragments * sizeof(int8_t*));
785  }
786  params[NUM_FRAGMENTS] = gpu_allocator_->alloc(sizeof(uint64_t));
787  gpu_allocator_->copyToDevice(params[NUM_FRAGMENTS], &num_fragments, sizeof(uint64_t));
788 
789  int8_t* literals_and_addr_mapping =
790  gpu_allocator_->alloc(literal_buff.size() + 2 * sizeof(int64_t));
791  CHECK_EQ(0, (int64_t)literals_and_addr_mapping % 8);
792  std::vector<int64_t> additional_literal_bytes;
793  const auto count_distinct_bitmap_mem = query_buffers_->getCountDistinctBitmapPtr();
794  if (count_distinct_bitmap_mem) {
795  // Store host and device addresses
796  const auto count_distinct_bitmap_host_mem = query_buffers_->getCountDistinctHostPtr();
797  CHECK(count_distinct_bitmap_host_mem);
798  additional_literal_bytes.push_back(
799  reinterpret_cast<int64_t>(count_distinct_bitmap_host_mem));
800  additional_literal_bytes.push_back(static_cast<int64_t>(count_distinct_bitmap_mem));
801  gpu_allocator_->copyToDevice(
802  literals_and_addr_mapping,
803  &additional_literal_bytes[0],
804  additional_literal_bytes.size() * sizeof(additional_literal_bytes[0]));
805  }
806  params[LITERALS] = literals_and_addr_mapping + additional_literal_bytes.size() *
807  sizeof(additional_literal_bytes[0]);
808  if (!literal_buff.empty()) {
809  CHECK(hoist_literals);
810  gpu_allocator_->copyToDevice(params[LITERALS], &literal_buff[0], literal_buff.size());
811  }
812  CHECK_EQ(num_rows.size(), col_buffers.size());
813  std::vector<int64_t> flatened_num_rows;
814  for (auto& nums : num_rows) {
815  CHECK_EQ(nums.size(), num_tables);
816  flatened_num_rows.insert(flatened_num_rows.end(), nums.begin(), nums.end());
817  }
818  params[NUM_ROWS] = gpu_allocator_->alloc(sizeof(int64_t) * flatened_num_rows.size());
819  gpu_allocator_->copyToDevice(params[NUM_ROWS],
820  &flatened_num_rows[0],
821  sizeof(int64_t) * flatened_num_rows.size());
822 
823  CHECK_EQ(frag_offsets.size(), col_buffers.size());
824  std::vector<int64_t> flatened_frag_offsets;
825  for (auto& offsets : frag_offsets) {
826  CHECK_EQ(offsets.size(), num_tables);
827  flatened_frag_offsets.insert(
828  flatened_frag_offsets.end(), offsets.begin(), offsets.end());
829  }
830  params[FRAG_ROW_OFFSETS] =
831  gpu_allocator_->alloc(sizeof(int64_t) * flatened_frag_offsets.size());
832  gpu_allocator_->copyToDevice(params[FRAG_ROW_OFFSETS],
833  &flatened_frag_offsets[0],
834  sizeof(int64_t) * flatened_num_rows.size());
835 
836  // Note that this will be overwritten if we are setting the entry count during group by
837  // buffer allocation and initialization
838  int32_t max_matched{scan_limit};
839  params[MAX_MATCHED] = gpu_allocator_->alloc(sizeof(max_matched));
840  gpu_allocator_->copyToDevice(params[MAX_MATCHED], &max_matched, sizeof(max_matched));
841 
842  int32_t total_matched{0};
843  params[TOTAL_MATCHED] = gpu_allocator_->alloc(sizeof(total_matched));
844  gpu_allocator_->copyToDevice(
845  params[TOTAL_MATCHED], &total_matched, sizeof(total_matched));
846 
847  if (is_group_by && !output_columnar_) {
848  auto cmpt_sz = align_to_int64(query_mem_desc_.getColsSize()) / sizeof(int64_t);
849  auto cmpt_val_buff = compact_init_vals(cmpt_sz, init_agg_vals, query_mem_desc_);
850  params[INIT_AGG_VALS] = gpu_allocator_->alloc(cmpt_sz * sizeof(int64_t));
851  gpu_allocator_->copyToDevice(
852  params[INIT_AGG_VALS], &cmpt_val_buff[0], cmpt_sz * sizeof(int64_t));
853  } else {
854  params[INIT_AGG_VALS] = gpu_allocator_->alloc(init_agg_vals.size() * sizeof(int64_t));
855  gpu_allocator_->copyToDevice(
856  params[INIT_AGG_VALS], &init_agg_vals[0], init_agg_vals.size() * sizeof(int64_t));
857  }
858 
859  params[ERROR_CODE] = gpu_allocator_->alloc(error_codes.size() * sizeof(error_codes[0]));
860  gpu_allocator_->copyToDevice(
861  params[ERROR_CODE], &error_codes[0], error_codes.size() * sizeof(error_codes[0]));
862 
863  params[NUM_TABLES] = gpu_allocator_->alloc(sizeof(uint32_t));
864  gpu_allocator_->copyToDevice(params[NUM_TABLES], &num_tables, sizeof(uint32_t));
865 
866  const auto hash_table_count = join_hash_tables.size();
867  switch (hash_table_count) {
868  case 0: {
869  params[JOIN_HASH_TABLES] = 0;
870  break;
871  }
872  case 1:
873  params[JOIN_HASH_TABLES] = join_hash_tables[0];
874  break;
875  default: {
876  params[JOIN_HASH_TABLES] =
877  gpu_allocator_->alloc(hash_table_count * sizeof(int64_t));
878  gpu_allocator_->copyToDevice(params[JOIN_HASH_TABLES],
879  &join_hash_tables[0],
880  hash_table_count * sizeof(int64_t));
881  break;
882  }
883  }
884 
885  // RowFunctionManager is not supported in GPU. We just keep the argument
886  // to avoid diverging from CPU generated code
887  params[ROW_FUNC_MGR] = nullptr;
888 
889  return params;
890 }
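To summarize the LITERALS layout built above: when a count-distinct bitmap is in use, two 8-byte slots holding the bitmap's host and device addresses precede the hoisted literal bytes, and params[LITERALS] points just past those slots. A stand-alone, host-memory-only sketch of that layout (illustration only; no CUDA, no OmniSciDB types, addresses are made up):

#include <cstdint>
#include <cstring>
#include <vector>

int main() {
  const std::vector<int8_t> literal_buff(16, 0x11);  // stand-in for the hoisted literals
  std::vector<int8_t> block(literal_buff.size() + 2 * sizeof(int64_t));

  const int64_t host_addr = 0x1000;  // stand-ins for the count-distinct bitmap addresses
  const int64_t dev_addr = 0x2000;
  std::memcpy(block.data(), &host_addr, sizeof(int64_t));
  std::memcpy(block.data() + sizeof(int64_t), &dev_addr, sizeof(int64_t));

  int8_t* literals = block.data() + 2 * sizeof(int64_t);  // what params[LITERALS] addresses
  std::memcpy(literals, literal_buff.data(), literal_buff.size());
  return literals[0] == 0x11 ? 0 : 1;
}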

Friends And Related Function Documentation

friend class Executor
friend

Definition at line 147 of file QueryExecutionContext.h.

Member Data Documentation

const ExecutorDeviceType QueryExecutionContext::device_type_
private

Definition at line 140 of file QueryExecutionContext.h.

Referenced by getRowSet(), and groupBufferToResults().

const ExecutorDispatchMode QueryExecutionContext::dispatch_mode_
private

Definition at line 141 of file QueryExecutionContext.h.

Referenced by launchGpuCode().

std::unique_ptr<ResultSet> QueryExecutionContext::estimator_result_set_
mutable private
const Executor* QueryExecutionContext::executor_
private
std::unique_ptr<DeviceAllocator> QueryExecutionContext::gpu_allocator_
private
const bool QueryExecutionContext::output_columnar_
private
std::shared_ptr<RowSetMemoryOwner> QueryExecutionContext::row_set_mem_owner_
private

Definition at line 142 of file QueryExecutionContext.h.

Referenced by getRowSet(), and groupBufferToDeinterleavedResults().


The documentation for this class was generated from the following files:

QueryExecutionContext.h
QueryExecutionContext.cpp