OmniSciDB  06b3bd477c
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
TableOptimizer.cpp
Go to the documentation of this file.
1 /*
2  * Copyright 2019 OmniSci, Inc.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #include "TableOptimizer.h"
18 
19 #include "Analyzer/Analyzer.h"
20 #include "QueryEngine/Execute.h"
21 #include "Shared/Logger.h"
22 #include "Shared/scope.h"
23 
// TableOptimizer constructor. Listing lines 24 and 26 (the first and last
// parameter lines) are missing from this extract; the cross-reference index
// gives the full signature as
//   TableOptimizer(const TableDescriptor* td, Executor* executor,
//                  const Catalog_Namespace::Catalog& cat).
// Stores the three arguments in td_, executor_ and cat_. Only td is CHECKed;
// executor is stored unchecked here.
25  Executor* executor,
27  : td_(td), executor_(executor), cat_(cat) {
28  CHECK(td);
29 }
30 namespace {
31 
// read_scalar_target_value<T>(const TargetValue& tv) -> T
// (listing line 33 with the signature is missing from this extract).
// Unwraps a TargetValue in two steps: first to its ScalarTargetValue
// alternative, then to the T held inside that scalar. Both lookups are
// CHECKed, so a value of any other kind fails the CHECK instead of being
// dereferenced. Returns a copy of the stored T.
32 template <typename T>
34  const auto stv = boost::get<ScalarTargetValue>(&tv);
35  CHECK(stv);
36  const auto val_ptr = boost::get<T>(stv);
37  CHECK(val_ptr);
38  return *val_ptr;
39 }
40 
// set_metadata_from_results(chunk_metadata, row, ti, has_nulls) -> bool
// (listing line 41, holding the return type, name and first parameter, is
// missing from this extract).
// Writes min/max/has_nulls into chunk_metadata via fillChunkStats, taking
// row[0] as the minimum and row[1] as the maximum. The scalar type used to
// read the row is chosen from ti.get_type().
// Returns false when the column type is not handled (the caller skips the
// column) and true in every other case.
42  const std::vector<TargetValue>& row,
43  const SQLTypeInfo& ti,
44  const bool has_nulls) {
45  switch (ti.get_type()) {
// Integer-like, decimal and date/time types are all read back as int64_t.
46  case kBOOLEAN:
47  case kTINYINT:
48  case kSMALLINT:
49  case kINT:
50  case kBIGINT:
51  case kNUMERIC:
52  case kDECIMAL:
53  case kTIME:
54  case kTIMESTAMP:
55  case kDATE: {
56  int64_t min_val = read_scalar_target_value<int64_t>(row[0]);
57  int64_t max_val = read_scalar_target_value<int64_t>(row[1]);
58  chunk_metadata.fillChunkStats(min_val, max_val, has_nulls);
59  break;
60  }
61  case kFLOAT: {
62  float min_val = read_scalar_target_value<float>(row[0]);
63  float max_val = read_scalar_target_value<float>(row[1]);
64  chunk_metadata.fillChunkStats(min_val, max_val, has_nulls);
65  break;
66  }
67  case kDOUBLE: {
68  double min_val = read_scalar_target_value<double>(row[0]);
69  double max_val = read_scalar_target_value<double>(row[1]);
70  chunk_metadata.fillChunkStats(min_val, max_val, has_nulls);
71  break;
72  }
// String types: stats are filled only for dictionary-encoded columns, using
// the int64_t values found in the row. For any other encoding the switch is
// left via `break` and the function still returns true, with chunk_metadata
// untouched.
73  case kVARCHAR:
74  case kCHAR:
75  case kTEXT:
76  if (ti.get_compression() == kENCODING_DICT) {
77  int64_t min_val = read_scalar_target_value<int64_t>(row[0]);
78  int64_t max_val = read_scalar_target_value<int64_t>(row[1]);
79  chunk_metadata.fillChunkStats(min_val, max_val, has_nulls);
80  }
81  break;
82  default: {
83  return false; // skip column
84  }
85  }
86  return true;
87 }
88 
// build_ra_exe_unit(input_col_desc, target_exprs) -> RelAlgExecutionUnit
// (listing line 89 with the return type and name is missing from this extract).
// Builds an execution unit that scans the single column described by
// input_col_desc and projects target_exprs. All remaining members are
// positionally initialized to empty lists, nullptr, a default SortInfo and 0.
// NOTE(review): the meaning of each positional member depends on the
// RelAlgExecutionUnit declaration, which is not shown here.
90  const std::shared_ptr<const InputColDescriptor> input_col_desc,
91  const std::vector<Analyzer::Expr*>& target_exprs) {
92  return RelAlgExecutionUnit{{input_col_desc->getScanDesc()},
93  {input_col_desc},
94  {},
95  {},
96  {},
97  {},
98  target_exprs,
99  nullptr,
100  SortInfo{{}, SortAlgorithm::Default, 0, 0},
101  0};
102 }
103 
// get_compilation_options(const ExecutorDeviceType& device_type) -> CompilationOptions
// (listing line 104 with the signature is missing from this extract).
// Returns compilation options for the given device type with the default
// optimization level; the two boolean members are both set to false.
// NOTE(review): what those booleans control is defined by CompilationOptions,
// which is not shown here.
105  return CompilationOptions{device_type, false, ExecutorOptLevel::Default, false};
106 }
107 
// get_execution_options() (listing line 108 with the signature is missing
// from this extract; it is called with no arguments elsewhere in this file).
// Returns an ExecutionOptions value with every boolean member false and both
// numeric members 0.
// NOTE(review): the members are set positionally; confirm their order against
// the ExecutionOptions declaration before changing any of them.
109  return ExecutionOptions{
110  false, false, false, false, false, false, false, false, 0, false, false, 0, false};
111 }
112 
113 } // namespace
114 
// recomputeMetadata() const (listing line 115 with the signature is missing
// from this extract; presumably a TableOptimizer member).
// Recomputes per-fragment chunk statistics for the table behind td_:
//   1. Collects the table descriptors to process (the physical shard tables
//      when td_->nShards > 0, otherwise td_ itself).
//   2. For each of them, if a deleted column exists, runs a per-fragment COUNT
//      on it to derive that column's min/max and to record a per-fragment
//      tuple count in tuple_count_map.
//   3. For every non-varlen column, runs per-fragment MIN/MAX/COUNT, turns the
//      result into ChunkStats and hands them to the fragmenter.
//   4. Checkpoints the table and clears the executor's meta-info cache.
// After all tables are processed, CPU (and, if present, GPU) memory held by
// the data manager is cleared. The whole function runs while holding a unique
// lock on executor_->execute_mutex_.
116  INJECT_TIMER(optimizeMetadata);
117  mapd_unique_lock<mapd_shared_mutex> lock(executor_->execute_mutex_);
118 
119  LOG(INFO) << "Recomputing metadata for " << td_->tableName;
120 
121  CHECK_GE(td_->tableId, 0);
122 
// Sharded tables are processed through their physical table descriptors.
123  std::vector<const TableDescriptor*> table_descriptors;
124  if (td_->nShards > 0) {
125  const auto physical_tds = cat_.getPhysicalTablesDescriptors(td_);
126  table_descriptors.insert(
127  table_descriptors.begin(), physical_tds.begin(), physical_tds.end());
128  } else {
129  table_descriptors.push_back(td_);
130  }
131 
132  auto& data_mgr = cat_.getDataMgr();
133 
134  for (const auto td : table_descriptors) {
// The guard resets the executor's row-set memory owner when this loop
// iteration ends, whichever way it is left.
135  ScopeGuard row_set_holder = [this] { executor_->row_set_mem_owner_ = nullptr; };
136  // We can use a smaller block size here, since we won't be running projection queries
137  executor_->row_set_mem_owner_ = std::make_shared<RowSetMemoryOwner>(1000000000);
138  executor_->catalog_ = &cat_;
139  const auto table_id = td->tableId;
140 
// Filled by the deleted-column pass below and read by the per-column pass to
// decide has_nulls. Stays empty when the table has no deleted column.
141  std::unordered_map</*fragment_id*/ int, size_t> tuple_count_map;
142 
143  // Special case handle $deleted column if it exists
144  // whilst handling the delete column also capture
145  // the number of non deleted rows per fragment
146  if (td->hasDeletedCol) {
147  auto cd = cat_.getDeletedColumn(td);
148  const auto column_id = cd->columnId;
149 
150  const auto input_col_desc =
151  std::make_shared<const InputColDescriptor>(column_id, table_id, 0);
152  const auto col_expr =
153  makeExpr<Analyzer::ColumnVar>(cd->columnType, table_id, column_id, 0);
154  const auto count_expr =
155  makeExpr<Analyzer::AggExpr>(cd->columnType, kCOUNT, col_expr, false, nullptr);
156 
157  const auto ra_exe_unit = build_ra_exe_unit(input_col_desc, {count_expr.get()});
158  const auto table_infos = get_table_infos(ra_exe_unit, executor_);
159  CHECK_EQ(table_infos.size(), size_t(1));
160 
// NOTE: listing line 161 is missing from this extract; it presumably defines
// `co`, which is passed to executeWorkUnitPerFragment below.
162  const auto eo = get_execution_options();
163 
164  std::unordered_map</*fragment_id*/ int, ChunkStats> stats_map;
165 
166  size_t total_num_tuples = 0;
// Invoked once per fragment with that fragment's COUNT result. It captures the
// maps and the running total by reference, so it must only run while they are
// alive (it is used by the executeWorkUnitPerFragment call just below).
167  Executor::PerFragmentCallBack compute_deleted_callback =
168  [&stats_map, &tuple_count_map, &total_num_tuples, cd](
169  ResultSetPtr results,
170  const Fragmenter_Namespace::FragmentInfo& fragment_info) {
171  // count number of tuples in $deleted as total number of tuples in table.
172  if (cd->isDeletedCol) {
173  total_num_tuples += fragment_info.getPhysicalNumTuples();
174  }
175  if (fragment_info.getPhysicalNumTuples() == 0) {
176  // TODO(adb): Should not happen, but just to be safe...
177  LOG(WARNING) << "Skipping completely empty fragment for column "
178  << cd->columnName;
179  return;
180  }
181 
182  const auto row = results->getNextRow(false, false);
183  CHECK_EQ(row.size(), size_t(1));
184 
185  const auto& ti = cd->columnType;
186 
187  auto chunk_metadata = std::make_shared<ChunkMetadata>();
188  chunk_metadata->sqlType = get_logical_type_info(ti);
189 
190  const auto count_val = read_scalar_target_value<int64_t>(row[0]);
191  if (count_val == 0) {
192  // Assume chunk of all nulls, bail
193  return;
194  }
195 
196  // fakerow[0] holds the min, fakerow[1] holds the max
197  std::vector<TargetValue> fakerow;
198 
199  auto num_tuples = static_cast<size_t>(count_val);
200 
201  // derive min/max of the deleted flag from the count
202  if (num_tuples == fragment_info.getPhysicalNumTuples()) {
203  // nothing deleted
204  // min = false;
205  // max = false;
206  fakerow.emplace_back(TargetValue{int64_t(0)});
207  fakerow.emplace_back(TargetValue{int64_t(0)});
208  } else {
// NOTE(review): count_val == 0 already returned above, so num_tuples == 0
// cannot be true here and the "everything marked as delete" branch is
// unreachable as written. Fully deleted fragments therefore get no entry in
// stats_map or tuple_count_map.
209  if (num_tuples == 0) {
210  // everything marked as delete
211  // min = true
212  // max = true
213  fakerow.emplace_back(TargetValue{int64_t(1)});
214  fakerow.emplace_back(TargetValue{int64_t(1)});
215  } else {
216  // some deleted
217  // min = false
218  // max = true;
219  fakerow.emplace_back(TargetValue{int64_t(0)});
220  fakerow.emplace_back(TargetValue{int64_t(1)});
221  }
222  }
223 
224  // place the manufactured min and max in a fake row to reuse the common code path
225  if (!set_metadata_from_results(*chunk_metadata, fakerow, ti, false)) {
226  LOG(WARNING) << "Unable to process new metadata values for column "
227  << cd->columnName;
228  return;
229  }
230 
231  stats_map.emplace(
232  std::make_pair(fragment_info.fragmentId, chunk_metadata->chunkStats));
233  tuple_count_map.emplace(std::make_pair(fragment_info.fragmentId, num_tuples));
234  };
235 
236  executor_->executeWorkUnitPerFragment(
237  ra_exe_unit, table_infos[0], co, eo, cat_, compute_deleted_callback);
238 
// Push the deleted-column stats to the fragmenter and set its row count to
// the sum of physical tuples over all fragments seen by the callback.
239  auto* fragmenter = td->fragmenter.get();
240  CHECK(fragmenter);
241  fragmenter->updateChunkStats(cd, stats_map);
242  fragmenter->setNumRows(total_num_tuples);
243  } // finished special handling deleted column;
244 
245  // TODO(adb): Support geo
246  auto col_descs = cat_.getAllColumnMetadataForTable(table_id, false, false, false);
247  for (const auto& cd : col_descs) {
248  const auto ti = cd->columnType;
249  const auto column_id = cd->columnId;
250 
251  if (ti.is_varlen()) {
252  LOG(INFO) << "Skipping varlen column " << cd->columnName;
253  continue;
254  }
255 
256  const auto input_col_desc =
257  std::make_shared<const InputColDescriptor>(column_id, table_id, 0);
258  const auto col_expr =
259  makeExpr<Analyzer::ColumnVar>(cd->columnType, table_id, column_id, 0);
260  auto max_expr =
261  makeExpr<Analyzer::AggExpr>(cd->columnType, kMAX, col_expr, false, nullptr);
262  auto min_expr =
263  makeExpr<Analyzer::AggExpr>(cd->columnType, kMIN, col_expr, false, nullptr);
264  auto count_expr =
265  makeExpr<Analyzer::AggExpr>(cd->columnType, kCOUNT, col_expr, false, nullptr);
266 
// For string columns, MIN/MAX are taken over KeyForStringExpr(col) with an
// integer (kINT) result type instead of over the column itself. COUNT still
// uses the column.
267  if (ti.is_string()) {
268  const SQLTypeInfo fun_ti(kINT);
269  const auto fun_expr = makeExpr<Analyzer::KeyForStringExpr>(col_expr);
270  max_expr = makeExpr<Analyzer::AggExpr>(fun_ti, kMAX, fun_expr, false, nullptr);
271  min_expr = makeExpr<Analyzer::AggExpr>(fun_ti, kMIN, fun_expr, false, nullptr);
272  }
// Target order matters: the callback reads row[0] = min, row[1] = max,
// row[2] = count.
273  const auto ra_exe_unit = build_ra_exe_unit(
274  input_col_desc, {min_expr.get(), max_expr.get(), count_expr.get()});
275  const auto table_infos = get_table_infos(ra_exe_unit, executor_);
276  CHECK_EQ(table_infos.size(), size_t(1));
277 
// NOTE: listing line 278 is missing from this extract; it presumably defines
// `co`, which is passed to executeWorkUnitPerFragment below.
279  const auto eo = get_execution_options();
280 
281  std::unordered_map</*fragment_id*/ int, ChunkStats> stats_map;
282 
// Invoked once per fragment with that fragment's MIN/MAX/COUNT row. It
// captures both maps by reference and is used only by the call just below.
283  Executor::PerFragmentCallBack compute_metadata_callback =
284  [&stats_map, &tuple_count_map, cd](
285  ResultSetPtr results,
286  const Fragmenter_Namespace::FragmentInfo& fragment_info) {
287  if (fragment_info.getPhysicalNumTuples() == 0) {
288  // TODO(adb): Should not happen, but just to be safe...
289  LOG(WARNING) << "Skipping completely empty fragment for column "
290  << cd->columnName;
291  return;
292  }
293 
294  const auto row = results->getNextRow(false, false);
295  CHECK_EQ(row.size(), size_t(3));
296 
297  const auto& ti = cd->columnType;
298 
299  auto chunk_metadata = std::make_shared<ChunkMetadata>();
300  chunk_metadata->sqlType = get_logical_type_info(ti);
301 
302  const auto count_val = read_scalar_target_value<int64_t>(row[2]);
303  if (count_val == 0) {
304  // Assume chunk of all nulls, bail
305  return;
306  }
307 
// The column has nulls when its COUNT differs from the fragment's expected
// tuple count: the count recorded by the deleted-column pass if there is one
// for this fragment, otherwise the physical tuple count.
308  bool has_nulls = true; // default to wide
309  auto tuple_count_itr = tuple_count_map.find(fragment_info.fragmentId);
310  if (tuple_count_itr != tuple_count_map.end()) {
311  has_nulls = !(static_cast<size_t>(count_val) == tuple_count_itr->second);
312  } else {
313  // no deleted column calc so use raw physical count
314  has_nulls = !(static_cast<size_t>(count_val) ==
315  fragment_info.getPhysicalNumTuples());
316  }
317 
318  if (!set_metadata_from_results(*chunk_metadata, row, ti, has_nulls)) {
319  LOG(WARNING) << "Unable to process new metadata values for column "
320  << cd->columnName;
321  return;
322  }
323 
324  stats_map.emplace(
325  std::make_pair(fragment_info.fragmentId, chunk_metadata->chunkStats));
326  };
327 
328  executor_->executeWorkUnitPerFragment(
329  ra_exe_unit, table_infos[0], co, eo, cat_, compute_metadata_callback);
330 
331  auto* fragmenter = td->fragmenter.get();
332  CHECK(fragmenter);
333  fragmenter->updateChunkStats(cd, stats_map);
334  }
// Checkpoint this table and drop the executor's cached meta info.
335  data_mgr.checkpoint(cat_.getCurrentDB().dbId, table_id);
336  executor_->clearMetaInfoCache();
337  }
338 
// Clear the data manager's CPU-level memory, and GPU-level memory when GPUs
// are present.
339  data_mgr.clearMemory(Data_Namespace::MemoryLevel::CPU_LEVEL);
340  if (data_mgr.gpusPresent()) {
341  data_mgr.clearMemory(Data_Namespace::MemoryLevel::GPU_LEVEL);
342  }
343 }
344 
// vacuumDeletedRows() const (listing line 345 with the signature is missing
// from this extract).
// Asks the catalog to vacuum deleted rows for the table identified by
// td_->tableId, then checkpoints that table through the catalog.
346  const auto table_id = td_->tableId;
347  cat_.vacuumDeletedRows(table_id);
348  cat_.checkpoint(table_id);
349 }
Defines data structures for the semantic analysis phase of query processing.
#define CHECK_EQ(x, y)
Definition: Logger.h:205
std::string cat(Ts &&...args)
class for a per-database catalog. also includes metadata for the current database and the current use...
Definition: Catalog.h:86
Definition: sqltypes.h:50
std::string tableName
const ColumnDescriptor * getDeletedColumn(const TableDescriptor *td) const
Definition: Catalog.cpp:2742
ExecutorDeviceType
Data_Namespace::DataMgr & getDataMgr() const
Definition: Catalog.h:195
#define LOG(tag)
Definition: Logger.h:188
std::vector< const TableDescriptor * > getPhysicalTablesDescriptors(const TableDescriptor *logicalTableDesc) const
Definition: Catalog.cpp:3494
CompilationOptions get_compilation_options(const ExecutorDeviceType &device_type)
void fillChunkStats(const T min, const T max, const bool has_nulls)
Definition: ChunkMetadata.h:49
#define CHECK_GE(x, y)
Definition: Logger.h:210
SQLTypeInfo get_logical_type_info(const SQLTypeInfo &type_info)
Definition: sqltypes.h:818
std::shared_ptr< ResultSet > ResultSetPtr
HOST DEVICE SQLTypes get_type() const
Definition: sqltypes.h:258
const TableDescriptor * td_
Definition: sqldefs.h:73
Executor * executor_
T read_scalar_target_value(const TargetValue &tv)
CHECK(cgen_state)
const DBMetadata & getCurrentDB() const
Definition: Catalog.h:194
#define INJECT_TIMER(DESC)
Definition: measure.h:91
Used by Fragmenter classes to store info about each fragment - the fragment id and number of tuples(r...
Definition: Fragmenter.h:78
void vacuumDeletedRows() const
Compacts fragments to remove deleted rows. When a row is deleted, a boolean deleted system column is ...
void checkpoint(const int logicalTableId) const
Definition: Catalog.cpp:3598
Definition: sqltypes.h:53
Definition: sqltypes.h:54
HOST DEVICE EncodingType get_compression() const
Definition: sqltypes.h:266
void vacuumDeletedRows(const TableDescriptor *td) const
Definition: Catalog.cpp:3706
std::list< const ColumnDescriptor * > getAllColumnMetadataForTable(const int tableId, const bool fetchSystemColumns, const bool fetchVirtualColumns, const bool fetchPhysicalColumns) const
Returns a list of pointers to constant ColumnDescriptor structs for all the columns from a particular...
Definition: Catalog.cpp:1704
Definition: sqldefs.h:76
Definition: sqltypes.h:42
bool set_metadata_from_results(ChunkMetadata &chunk_metadata, const std::vector< TargetValue > &row, const SQLTypeInfo &ti, const bool has_nulls)
std::function< void(ResultSetPtr, const Fragmenter_Namespace::FragmentInfo &)> PerFragmentCallBack
Definition: Execute.h:454
std::vector< InputTableInfo > get_table_infos(const std::vector< InputDescriptor > &input_descs, Executor *executor)
boost::variant< ScalarTargetValue, ArrayTargetValue, GeoTargetValue, GeoTargetValuePtr > TargetValue
Definition: TargetValue.h:167
TableOptimizer(const TableDescriptor *td, Executor *executor, const Catalog_Namespace::Catalog &cat)
Definition: sqltypes.h:46
specifies the content in-memory of a row in the table metadata table
Definition: sqldefs.h:74
RelAlgExecutionUnit build_ra_exe_unit(const std::shared_ptr< const InputColDescriptor > input_col_desc, const std::vector< Analyzer::Expr * > &target_exprs)
const Catalog_Namespace::Catalog & cat_
void recomputeMetadata() const
Recomputes per-chunk metadata for each fragment in the table. Updates and deletes can cause chunk met...