OmniSciDB  06b3bd477c
QueryFragmentDescriptor Class Reference

#include <QueryFragmentDescriptor.h>

Public Member Functions

 QueryFragmentDescriptor (const RelAlgExecutionUnit &ra_exe_unit, const std::vector< InputTableInfo > &query_infos, const std::vector< Data_Namespace::MemoryInfo > &gpu_mem_infos, const double gpu_input_mem_limit_percent, const std::vector< size_t > allowed_outer_fragment_indices)
 
void buildFragmentKernelMap (const RelAlgExecutionUnit &ra_exe_unit, const std::vector< uint64_t > &frag_offsets, const int device_count, const ExecutorDeviceType &device_type, const bool enable_multifrag_kernels, const bool enable_inner_join_fragment_skipping, Executor *executor)
 
template<typename DISPATCH_FCN >
void assignFragsToMultiDispatch (DISPATCH_FCN f) const
 
template<typename DISPATCH_FCN >
void assignFragsToKernelDispatch (DISPATCH_FCN f, const RelAlgExecutionUnit &ra_exe_unit) const
 
bool shouldCheckWorkUnitWatchdog () const
 

Static Public Member Functions

static void computeAllTablesFragments (std::map< int, const TableFragments * > &all_tables_fragments, const RelAlgExecutionUnit &ra_exe_unit, const std::vector< InputTableInfo > &query_infos)
 

Protected Member Functions

void buildFragmentPerKernelMapForUnion (const RelAlgExecutionUnit &ra_exe_unit, const std::vector< uint64_t > &frag_offsets, const int device_count, const size_t num_bytes_for_row, const ExecutorDeviceType &device_type, Executor *executor)
 
void buildFragmentPerKernelMap (const RelAlgExecutionUnit &ra_exe_unit, const std::vector< uint64_t > &frag_offsets, const int device_count, const size_t num_bytes_for_row, const ExecutorDeviceType &device_type, Executor *executor)
 
void buildMultifragKernelMap (const RelAlgExecutionUnit &ra_exe_unit, const std::vector< uint64_t > &frag_offsets, const int device_count, const size_t num_bytes_for_row, const ExecutorDeviceType &device_type, const bool enable_inner_join_fragment_skipping, Executor *executor)
 
void buildFragmentPerKernelForTable (const TableFragments *fragments, const RelAlgExecutionUnit &ra_exe_unit, const InputDescriptor &table_desc, const std::vector< uint64_t > &frag_offsets, const int device_count, const size_t num_bytes_for_row, const ChunkMetadataVector &deleted_chunk_metadata_vec, const std::optional< size_t > table_desc_offset, const ExecutorDeviceType &device_type, Executor *executor)
 
bool terminateDispatchMaybe (size_t &tuple_count, const RelAlgExecutionUnit &ra_exe_unit, const ExecutionKernelDescriptor &kernel) const
 
void checkDeviceMemoryUsage (const Fragmenter_Namespace::FragmentInfo &fragment, const int device_id, const size_t num_cols)
 

Protected Attributes

std::vector< size_t > allowed_outer_fragment_indices_
 
size_t outer_fragments_size_ = 0
 
int64_t rowid_lookup_key_ = -1
 
std::map< int, const TableFragments * > selected_tables_fragments_
 
std::map< int, std::vector< ExecutionKernelDescriptor > > execution_kernels_per_device_
 
double gpu_input_mem_limit_percent_
 
std::map< size_t, size_t > tuple_count_per_device_
 
std::map< size_t, size_t > available_gpu_mem_bytes_
 

Detailed Description

Definition at line 67 of file QueryFragmentDescriptor.h.
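
QueryFragmentDescriptor maps the fragments of every table referenced by a query onto per-device execution kernels. The typical flow is to construct the descriptor and then call buildFragmentKernelMap() to populate the per-device kernel map before dispatching. The sketch below is not taken from the OmniSciDB sources; it assumes ra_exe_unit, query_infos, gpu_mem_infos, frag_offsets, device_count, device_type and executor have already been prepared by the calling code.

// Minimal usage sketch (assumptions noted above).
QueryFragmentDescriptor fragment_descriptor(ra_exe_unit,
                                            query_infos,
                                            gpu_mem_infos,
                                            /*gpu_input_mem_limit_percent=*/0.9,
                                            /*allowed_outer_fragment_indices=*/{});
fragment_descriptor.buildFragmentKernelMap(ra_exe_unit,
                                           frag_offsets,
                                           device_count,
                                           device_type,
                                           /*enable_multifrag_kernels=*/false,
                                           /*enable_inner_join_fragment_skipping=*/false,
                                           executor);
// The per-device kernel map is now populated and can be walked with
// assignFragsToKernelDispatch() or assignFragsToMultiDispatch(); see below.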

Constructor & Destructor Documentation

QueryFragmentDescriptor::QueryFragmentDescriptor ( const RelAlgExecutionUnit &  ra_exe_unit,
const std::vector< InputTableInfo > &  query_infos,
const std::vector< Data_Namespace::MemoryInfo > &  gpu_mem_infos,
const double  gpu_input_mem_limit_percent,
const std::vector< size_t >  allowed_outer_fragment_indices 
)

Definition at line 24 of file QueryFragmentDescriptor.cpp.

References available_gpu_mem_bytes_, CHECK_EQ, RelAlgExecutionUnit::input_descs, and selected_tables_fragments_.

30  : allowed_outer_fragment_indices_(allowed_outer_fragment_indices)
31  , gpu_input_mem_limit_percent_(gpu_input_mem_limit_percent) {
32  const size_t input_desc_count{ra_exe_unit.input_descs.size()};
33  CHECK_EQ(query_infos.size(), input_desc_count);
34  for (size_t table_idx = 0; table_idx < input_desc_count; ++table_idx) {
35  const auto table_id = ra_exe_unit.input_descs[table_idx].getTableId();
36  if (!selected_tables_fragments_.count(table_id)) {
37  selected_tables_fragments_[ra_exe_unit.input_descs[table_idx].getTableId()] =
38  &query_infos[table_idx].info.fragments;
39  }
40  }
41 
42  for (size_t device_id = 0; device_id < gpu_mem_infos.size(); device_id++) {
43  const auto& gpu_mem_info = gpu_mem_infos[device_id];
44  available_gpu_mem_bytes_[device_id] =
45  gpu_mem_info.maxNumPages * gpu_mem_info.pageSize;
46  }
47 }

Member Function Documentation

template<typename DISPATCH_FCN >
void QueryFragmentDescriptor::assignFragsToKernelDispatch ( DISPATCH_FCN  f,
const RelAlgExecutionUnit &  ra_exe_unit 
) const
inline

Dispatch one fragment for each device. Iterate the device map and dispatch one kernel for each device per iteration. This allows balanced dispatch as well as early termination if the number of rows passing the kernel can be computed at dispatch time and the scan limit is reached.
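
A dispatch functor compatible with this method might look like the following minimal sketch (not from the OmniSciDB sources; launch_kernel_for_device is a hypothetical helper). The functor receives the device id, the fragment list of one execution kernel, and the current rowid lookup key; iteration stops early once terminateDispatchMaybe() reports that the scan limit is satisfied.

fragment_descriptor.assignFragsToKernelDispatch(
    [&](const int device_id,
        const FragmentsList& frag_list,
        const int64_t rowid_lookup_key) {
      // Hypothetical helper: enqueue one execution kernel on this device.
      launch_kernel_for_device(device_id, frag_list, rowid_lookup_key);
    },
    ra_exe_unit);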

Definition at line 110 of file QueryFragmentDescriptor.h.

References CHECK(), execution_kernels_per_device_, rowid_lookup_key_, and terminateDispatchMaybe().

111  {
112  if (execution_kernels_per_device_.empty()) {
113  return;
114  }
115 
116  size_t tuple_count = 0;
117 
118  std::unordered_map<int, size_t> execution_kernel_index;
119  for (const auto& device_itr : execution_kernels_per_device_) {
120  CHECK(execution_kernel_index.insert(std::make_pair(device_itr.first, size_t(0)))
121  .second);
122  }
123 
124  bool dispatch_finished = false;
125  while (!dispatch_finished) {
126  dispatch_finished = true;
127  for (const auto& device_itr : execution_kernels_per_device_) {
128  auto& kernel_idx = execution_kernel_index[device_itr.first];
129  if (kernel_idx < device_itr.second.size()) {
130  dispatch_finished = false;
131  const auto& execution_kernel = device_itr.second[kernel_idx++];
132  f(device_itr.first, execution_kernel.fragments, rowid_lookup_key_);
133  if (terminateDispatchMaybe(tuple_count, ra_exe_unit, execution_kernel)) {
134  return;
135  }
136  }
137  }
138  }
139  }

template<typename DISPATCH_FCN >
void QueryFragmentDescriptor::assignFragsToMultiDispatch ( DISPATCH_FCN  f) const
inline

Dispatch multi-fragment kernels. Currently GPU only. Each GPU should have only one kernel, with multiple fragments in its fragments list.
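
A multi-fragment dispatch functor has the same shape but is invoked exactly once per device, receiving that device's full fragment list. A minimal sketch (not from the OmniSciDB sources; launch_multifrag_kernel is a hypothetical helper):

fragment_descriptor.assignFragsToMultiDispatch(
    [&](const int device_id,
        const FragmentsList& frag_list,
        const int64_t rowid_lookup_key) {
      // Hypothetical helper: build one multi-fragment GPU kernel for this device.
      launch_multifrag_kernel(device_id, frag_list, rowid_lookup_key);
    });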

Definition at line 93 of file QueryFragmentDescriptor.h.

References CHECK_EQ, execution_kernels_per_device_, and rowid_lookup_key_.

93  {
94  for (const auto& device_itr : execution_kernels_per_device_) {
95  const auto& execution_kernels = device_itr.second;
96  CHECK_EQ(execution_kernels.size(), size_t(1));
97 
98  const auto& fragments_list = execution_kernels.front().fragments;
99  f(device_itr.first, fragments_list, rowid_lookup_key_);
100  }
101  }
void QueryFragmentDescriptor::buildFragmentKernelMap ( const RelAlgExecutionUnit &  ra_exe_unit,
const std::vector< uint64_t > &  frag_offsets,
const int  device_count,
const ExecutorDeviceType &  device_type,
const bool  enable_multifrag_kernels,
const bool  enable_inner_join_fragment_skipping,
Executor *  executor 
)

Definition at line 63 of file QueryFragmentDescriptor.cpp.

References buildFragmentPerKernelMap(), buildFragmentPerKernelMapForUnion(), buildMultifragKernelMap(), RelAlgExecutionUnit::input_descs, and RelAlgExecutionUnit::union_all.

70  {
71  // For joins, only consider the cardinality of the LHS
72  // columns in the bytes per row count.
73  std::set<int> lhs_table_ids;
74  for (const auto& input_desc : ra_exe_unit.input_descs) {
75  if (input_desc.getNestLevel() == 0) {
76  lhs_table_ids.insert(input_desc.getTableId());
77  }
78  }
79 
80  const auto num_bytes_for_row = executor->getNumBytesForFetchedRow(lhs_table_ids);
81 
82  if (ra_exe_unit.union_all) {
83  buildFragmentPerKernelMapForUnion(ra_exe_unit,
84  frag_offsets,
85  device_count,
86  num_bytes_for_row,
87  device_type,
88  executor);
89  } else if (enable_multifrag_kernels) {
90  buildMultifragKernelMap(ra_exe_unit,
91  frag_offsets,
92  device_count,
93  num_bytes_for_row,
94  device_type,
95  enable_inner_join_fragment_skipping,
96  executor);
97  } else {
98  buildFragmentPerKernelMap(ra_exe_unit,
99  frag_offsets,
100  device_count,
101  num_bytes_for_row,
102  device_type,
103  executor);
104  }
105 }

void QueryFragmentDescriptor::buildFragmentPerKernelForTable ( const TableFragments *  fragments,
const RelAlgExecutionUnit &  ra_exe_unit,
const InputDescriptor &  table_desc,
const std::vector< uint64_t > &  frag_offsets,
const int  device_count,
const size_t  num_bytes_for_row,
const ChunkMetadataVector &  deleted_chunk_metadata_vec,
const std::optional< size_t >  table_desc_offset,
const ExecutorDeviceType &  device_type,
Executor *  executor 
)
protected

Definition at line 107 of file QueryFragmentDescriptor.cpp.

References allowed_outer_fragment_indices_, CHECK(), CHECK_GE, CHECK_GT, checkDeviceMemoryUsage(), CPU, Data_Namespace::CPU_LEVEL, execution_kernels_per_device_, GPU, Data_Namespace::GPU_LEVEL, RelAlgExecutionUnit::input_descs, rowid_lookup_key_, selected_tables_fragments_, and RelAlgExecutionUnit::simple_quals.

Referenced by buildFragmentPerKernelMap(), and buildFragmentPerKernelMapForUnion().

117  {
118  auto get_fragment_tuple_count =
119  [&deleted_chunk_metadata_vec](const auto& fragment) -> std::optional<size_t> {
120  // returning std::nullopt disables execution dispatch optimizations based on tuple
121  // counts as it signals to the dispatch mechanism that a reliable tuple count cannot
122  // be obtained. This is the case for fragments which have deleted rows, temporary
123  // table fragments, or fragments in a UNION query.
124  if (deleted_chunk_metadata_vec.empty()) {
125  return std::nullopt;
126  }
127  const auto fragment_id = fragment.fragmentId;
128  CHECK_GE(fragment_id, 0);
129  if (static_cast<size_t>(fragment_id) < deleted_chunk_metadata_vec.size()) {
130  const auto& chunk_metadata = deleted_chunk_metadata_vec[fragment_id];
131  if (chunk_metadata.second->chunkStats.max.tinyintval == 1) {
132  return std::nullopt;
133  }
134  }
135  return fragment.getNumTuples();
136  };
137 
138  for (size_t i = 0; i < fragments->size(); i++) {
139  if (!allowed_outer_fragment_indices_.empty()) {
140  if (std::find(allowed_outer_fragment_indices_.begin(),
141  allowed_outer_fragment_indices_.end(),
142  i) == allowed_outer_fragment_indices_.end()) {
143  continue;
144  }
145  }
146 
147  const auto& fragment = (*fragments)[i];
148  const auto skip_frag = executor->skipFragment(
149  table_desc, fragment, ra_exe_unit.simple_quals, frag_offsets, i);
150  if (skip_frag.first) {
151  continue;
152  }
153  rowid_lookup_key_ = std::max(rowid_lookup_key_, skip_frag.second);
154  const int chosen_device_count =
155  device_type == ExecutorDeviceType::CPU ? 1 : device_count;
156  CHECK_GT(chosen_device_count, 0);
157  const auto memory_level = device_type == ExecutorDeviceType::GPU
158  ? Data_Namespace::GPU_LEVEL
159  : Data_Namespace::CPU_LEVEL;
160  const int device_id = (device_type == ExecutorDeviceType::CPU || fragment.shard == -1)
161  ? fragment.deviceIds[static_cast<int>(memory_level)]
162  : fragment.shard % chosen_device_count;
163 
164  if (device_type == ExecutorDeviceType::GPU) {
165  checkDeviceMemoryUsage(fragment, device_id, num_bytes_for_row);
166  }
167 
168  ExecutionKernelDescriptor execution_kernel_desc{
169  device_id, {}, get_fragment_tuple_count(fragment)};
170  if (table_desc_offset) {
171  const auto frag_ids =
172  executor->getTableFragmentIndices(ra_exe_unit,
173  device_type,
174  *table_desc_offset,
175  i,
176  selected_tables_fragments_,
177  executor->getInnerTabIdToJoinCond());
178  const auto table_id = ra_exe_unit.input_descs[*table_desc_offset].getTableId();
179  execution_kernel_desc.fragments.emplace_back(FragmentsPerTable{table_id, frag_ids});
180 
181  } else {
182  for (size_t j = 0; j < ra_exe_unit.input_descs.size(); ++j) {
183  const auto frag_ids =
184  executor->getTableFragmentIndices(ra_exe_unit,
185  device_type,
186  j,
187  i,
188  selected_tables_fragments_,
189  executor->getInnerTabIdToJoinCond());
190  const auto table_id = ra_exe_unit.input_descs[j].getTableId();
191  auto table_frags_it = selected_tables_fragments_.find(table_id);
192  CHECK(table_frags_it != selected_tables_fragments_.end());
193 
194  execution_kernel_desc.fragments.emplace_back(
195  FragmentsPerTable{table_id, frag_ids});
196  }
197  }
198 
199  auto itr = execution_kernels_per_device_.find(device_id);
200  if (itr == execution_kernels_per_device_.end()) {
201  auto const pair = execution_kernels_per_device_.insert(std::make_pair(
202  device_id,
203  std::vector<ExecutionKernelDescriptor>{std::move(execution_kernel_desc)}));
204  CHECK(pair.second);
205  } else {
206  itr->second.emplace_back(std::move(execution_kernel_desc));
207  }
208  }
209 }

void QueryFragmentDescriptor::buildFragmentPerKernelMap ( const RelAlgExecutionUnit &  ra_exe_unit,
const std::vector< uint64_t > &  frag_offsets,
const int  device_count,
const size_t  num_bytes_for_row,
const ExecutorDeviceType &  device_type,
Executor *  executor 
)
protected

Definition at line 266 of file QueryFragmentDescriptor.cpp.

References buildFragmentPerKernelForTable(), CHECK(), RelAlgExecutionUnit::input_descs, outer_fragments_size_, and selected_tables_fragments_.

Referenced by buildFragmentKernelMap().

272  {
273  const auto& outer_table_desc = ra_exe_unit.input_descs.front();
274  const int outer_table_id = outer_table_desc.getTableId();
275  auto it = selected_tables_fragments_.find(outer_table_id);
276  CHECK(it != selected_tables_fragments_.end());
277  const auto outer_fragments = it->second;
278  outer_fragments_size_ = outer_fragments->size();
279 
280  const auto& catalog = executor->getCatalog();
281 
282  auto& data_mgr = catalog->getDataMgr();
283  ChunkMetadataVector deleted_chunk_metadata_vec;
284 
285  if (outer_table_id > 0) {
286  // Temporary tables will not have a table descriptor and not have deleted rows.
287  const auto td = catalog->getMetadataForTable(outer_table_id);
288  CHECK(td);
289  const auto deleted_cd = catalog->getDeletedColumnIfRowsDeleted(td);
290  if (deleted_cd) {
291  ChunkKey chunk_key_prefix = {
292  catalog->getCurrentDB().dbId, outer_table_id, deleted_cd->columnId};
293  data_mgr.getChunkMetadataVecForKeyPrefix(deleted_chunk_metadata_vec,
294  chunk_key_prefix);
295  }
296  }
297 
298  buildFragmentPerKernelForTable(outer_fragments,
299  ra_exe_unit,
300  outer_table_desc,
301  frag_offsets,
302  device_count,
303  num_bytes_for_row,
304  deleted_chunk_metadata_vec,
305  std::nullopt,
306  device_type,
307  executor);
308 }

void QueryFragmentDescriptor::buildFragmentPerKernelMapForUnion ( const RelAlgExecutionUnit &  ra_exe_unit,
const std::vector< uint64_t > &  frag_offsets,
const int  device_count,
const size_t  num_bytes_for_row,
const ExecutorDeviceType &  device_type,
Executor *  executor 
)
protected

Definition at line 211 of file QueryFragmentDescriptor.cpp.

References buildFragmentPerKernelForTable(), CHECK(), execution_kernels_per_device_, RelAlgExecutionUnit::input_descs, shared::printContainer(), selected_tables_fragments_, and VLOG.

Referenced by buildFragmentKernelMap().

217  {
218  const auto& catalog = executor->getCatalog();
219 
220  for (size_t j = 0; j < ra_exe_unit.input_descs.size(); ++j) {
221  auto const& table_desc = ra_exe_unit.input_descs[j];
222  int const table_id = table_desc.getTableId();
223  TableFragments const* fragments = selected_tables_fragments_.at(table_id);
224 
225  auto& data_mgr = catalog->getDataMgr();
226  ChunkMetadataVector deleted_chunk_metadata_vec;
227  if (table_id > 0) {
228  // Temporary tables will not have a table descriptor and not have deleted rows.
229  const auto td = catalog->getMetadataForTable(table_id);
230  CHECK(td);
231  auto deleted_cd = catalog->getDeletedColumnIfRowsDeleted(td);
232  if (deleted_cd) {
233  ChunkKey chunk_key_prefix = {
234  catalog->getCurrentDB().dbId, table_id, deleted_cd->columnId};
235  data_mgr.getChunkMetadataVecForKeyPrefix(deleted_chunk_metadata_vec,
236  chunk_key_prefix);
237  }
238  }
239 
240  buildFragmentPerKernelForTable(fragments,
241  ra_exe_unit,
242  table_desc,
243  frag_offsets,
244  device_count,
245  num_bytes_for_row,
246  {},
247  j,
248  device_type,
249  executor);
250 
251  std::vector<int> table_ids =
252  std::accumulate(execution_kernels_per_device_[0].begin(),
253  execution_kernels_per_device_[0].end(),
254  std::vector<int>(),
255  [](auto&& vec, auto& exe_kern) {
256  vec.push_back(exe_kern.fragments[0].table_id);
257  return vec;
258  });
259  VLOG(1) << "execution_kernels_per_device_.size()="
260  << execution_kernels_per_device_.size()
261  << " execution_kernels_per_device_[0][*].fragments[0].table_id="
262  << shared::printContainer(table_ids);
263  }
264 }

void QueryFragmentDescriptor::buildMultifragKernelMap ( const RelAlgExecutionUnit &  ra_exe_unit,
const std::vector< uint64_t > &  frag_offsets,
const int  device_count,
const size_t  num_bytes_for_row,
const ExecutorDeviceType &  device_type,
const bool  enable_inner_join_fragment_skipping,
Executor *  executor 
)
protected

Definition at line 310 of file QueryFragmentDescriptor.cpp.

References allowed_outer_fragment_indices_, CHECK(), CHECK_EQ, checkDeviceMemoryUsage(), execution_kernels_per_device_, GPU, Data_Namespace::GPU_LEVEL, RelAlgExecutionUnit::input_descs, outer_fragments_size_, rowid_lookup_key_, selected_tables_fragments_, and RelAlgExecutionUnit::simple_quals.

Referenced by buildFragmentKernelMap().

317  {
318  // Allocate all the fragments of the tables involved in the query to available
319  // devices. The basic idea: the device is decided by the outer table in the
320  // query (the first table in a join) and we need to broadcast the fragments
321  // in the inner table to each device. Sharding will change this model.
322  const auto& outer_table_desc = ra_exe_unit.input_descs.front();
323  const int outer_table_id = outer_table_desc.getTableId();
324  auto it = selected_tables_fragments_.find(outer_table_id);
325  CHECK(it != selected_tables_fragments_.end());
326  const auto outer_fragments = it->second;
327  outer_fragments_size_ = outer_fragments->size();
328 
329  const auto inner_table_id_to_join_condition = executor->getInnerTabIdToJoinCond();
330 
331  for (size_t outer_frag_id = 0; outer_frag_id < outer_fragments->size();
332  ++outer_frag_id) {
333  if (!allowed_outer_fragment_indices_.empty()) {
334  if (std::find(allowed_outer_fragment_indices_.begin(),
335  allowed_outer_fragment_indices_.end(),
336  outer_frag_id) == allowed_outer_fragment_indices_.end()) {
337  continue;
338  }
339  }
340 
341  const auto& fragment = (*outer_fragments)[outer_frag_id];
342  auto skip_frag = executor->skipFragment(outer_table_desc,
343  fragment,
344  ra_exe_unit.simple_quals,
345  frag_offsets,
346  outer_frag_id);
347  if (enable_inner_join_fragment_skipping &&
348  (skip_frag == std::pair<bool, int64_t>(false, -1))) {
349  skip_frag = executor->skipFragmentInnerJoins(
350  outer_table_desc, ra_exe_unit, fragment, frag_offsets, outer_frag_id);
351  }
352  if (skip_frag.first) {
353  continue;
354  }
355  const int device_id =
356  fragment.shard == -1
357  ? fragment.deviceIds[static_cast<int>(Data_Namespace::GPU_LEVEL)]
358  : fragment.shard % device_count;
359  if (device_type == ExecutorDeviceType::GPU) {
360  checkDeviceMemoryUsage(fragment, device_id, num_bytes_for_row);
361  }
362  for (size_t j = 0; j < ra_exe_unit.input_descs.size(); ++j) {
363  const auto table_id = ra_exe_unit.input_descs[j].getTableId();
364  auto table_frags_it = selected_tables_fragments_.find(table_id);
365  CHECK(table_frags_it != selected_tables_fragments_.end());
366  const auto frag_ids =
367  executor->getTableFragmentIndices(ra_exe_unit,
368  device_type,
369  j,
370  outer_frag_id,
371  selected_tables_fragments_,
372  inner_table_id_to_join_condition);
373 
374  if (execution_kernels_per_device_.find(device_id) ==
375  execution_kernels_per_device_.end()) {
376  std::vector<ExecutionKernelDescriptor> kernel_descs{
377  ExecutionKernelDescriptor{device_id, FragmentsList{}, std::nullopt}};
378  CHECK(
379  execution_kernels_per_device_.insert(std::make_pair(device_id, kernel_descs))
380  .second);
381  }
382 
383  // Multifrag kernels only have one execution kernel per device. Grab the execution
384  // kernel object and push back into its fragments list.
385  CHECK_EQ(execution_kernels_per_device_[device_id].size(), size_t(1));
386  auto& execution_kernel = execution_kernels_per_device_[device_id].front();
387 
388  auto& kernel_frag_list = execution_kernel.fragments;
389  if (kernel_frag_list.size() < j + 1) {
390  kernel_frag_list.emplace_back(FragmentsPerTable{table_id, frag_ids});
391  } else {
392  CHECK_EQ(kernel_frag_list[j].table_id, table_id);
393  auto& curr_frag_ids = kernel_frag_list[j].fragment_ids;
394  for (const int frag_id : frag_ids) {
395  if (std::find(curr_frag_ids.begin(), curr_frag_ids.end(), frag_id) ==
396  curr_frag_ids.end()) {
397  curr_frag_ids.push_back(frag_id);
398  }
399  }
400  }
401  }
402  rowid_lookup_key_ = std::max(rowid_lookup_key_, skip_frag.second);
403  }
404 }

void QueryFragmentDescriptor::checkDeviceMemoryUsage ( const Fragmenter_Namespace::FragmentInfo &  fragment,
const int  device_id,
const size_t  num_cols 
)
protected

Definition at line 440 of file QueryFragmentDescriptor.cpp.

References available_gpu_mem_bytes_, CHECK_GE, g_cluster, Fragmenter_Namespace::FragmentInfo::getNumTuples(), gpu_input_mem_limit_percent_, LOG, tuple_count_per_device_, and logger::WARNING.

Referenced by buildFragmentPerKernelForTable(), and buildMultifragKernelMap().

443  {
444  if (g_cluster) {
445  // Disabled in distributed mode for now
446  return;
447  }
448  CHECK_GE(device_id, 0);
449  tuple_count_per_device_[device_id] += fragment.getNumTuples();
450  const size_t gpu_bytes_limit =
451  available_gpu_mem_bytes_[device_id] * gpu_input_mem_limit_percent_;
452  if (tuple_count_per_device_[device_id] * num_bytes_for_row > gpu_bytes_limit) {
453  LOG(WARNING) << "Not enough memory on device " << device_id
454  << " for input chunks totaling "
455  << tuple_count_per_device_[device_id] * num_bytes_for_row
456  << " bytes (available device memory: " << gpu_bytes_limit << " bytes)";
457  throw QueryMustRunOnCpu();
458  }
459 }

void QueryFragmentDescriptor::computeAllTablesFragments ( std::map< int, const TableFragments * > &  all_tables_fragments,
const RelAlgExecutionUnit &  ra_exe_unit,
const std::vector< InputTableInfo > &  query_infos 
)
static

Definition at line 49 of file QueryFragmentDescriptor.cpp.

References CHECK_EQ, and RelAlgExecutionUnit::input_descs.

Referenced by ExecutionKernel::runImpl().

52  {
53  for (size_t tab_idx = 0; tab_idx < ra_exe_unit.input_descs.size(); ++tab_idx) {
54  int table_id = ra_exe_unit.input_descs[tab_idx].getTableId();
55  CHECK_EQ(query_infos[tab_idx].table_id, table_id);
56  const auto& fragments = query_infos[tab_idx].info.fragments;
57  if (!all_tables_fragments.count(table_id)) {
58  all_tables_fragments.insert(std::make_pair(table_id, &fragments));
59  }
60  }
61 }

bool QueryFragmentDescriptor::shouldCheckWorkUnitWatchdog ( ) const
inline

Definition at line 141 of file QueryFragmentDescriptor.h.

References execution_kernels_per_device_, and rowid_lookup_key_.

141  {
142  return rowid_lookup_key_ < 0 && !execution_kernels_per_device_.empty();
143  }
bool QueryFragmentDescriptor::terminateDispatchMaybe ( size_t &  tuple_count,
const RelAlgExecutionUnit &  ra_exe_unit,
const ExecutionKernelDescriptor &  kernel 
) const
protected

Definition at line 422 of file QueryFragmentDescriptor.cpp.

References anonymous_namespace{QueryFragmentDescriptor.cpp}::is_sample_query(), SortInfo::limit, SortInfo::offset, ExecutionKernelDescriptor::outer_tuple_count, and RelAlgExecutionUnit::sort_info.

Referenced by assignFragsToKernelDispatch().

425  {
426  const auto sample_query_limit =
427  ra_exe_unit.sort_info.limit + ra_exe_unit.sort_info.offset;
428  if (!kernel.outer_tuple_count) {
429  return false;
430  } else {
431  tuple_count += *kernel.outer_tuple_count;
432  if (is_sample_query(ra_exe_unit) && sample_query_limit > 0 &&
433  tuple_count >= sample_query_limit) {
434  return true;
435  }
436  }
437  return false;
438 }

Member Data Documentation

std::vector<size_t> QueryFragmentDescriptor::allowed_outer_fragment_indices_
protected
std::map<size_t, size_t> QueryFragmentDescriptor::available_gpu_mem_bytes_
protected

Definition at line 156 of file QueryFragmentDescriptor.h.

Referenced by checkDeviceMemoryUsage(), and QueryFragmentDescriptor().

std::map<int, std::vector<ExecutionKernelDescriptor> > QueryFragmentDescriptor::execution_kernels_per_device_
protected
double QueryFragmentDescriptor::gpu_input_mem_limit_percent_
protected

Definition at line 154 of file QueryFragmentDescriptor.h.

Referenced by checkDeviceMemoryUsage().

size_t QueryFragmentDescriptor::outer_fragments_size_ = 0
protected
int64_t QueryFragmentDescriptor::rowid_lookup_key_ = -1
protected
std::map<int, const TableFragments*> QueryFragmentDescriptor::selected_tables_fragments_
protected
std::map<size_t, size_t> QueryFragmentDescriptor::tuple_count_per_device_
protected

Definition at line 155 of file QueryFragmentDescriptor.h.

Referenced by checkDeviceMemoryUsage().


The documentation for this class was generated from the following files:

QueryFragmentDescriptor.h
QueryFragmentDescriptor.cpp