OmniSciDB  c07336695a
TableOptimizer.cpp
Go to the documentation of this file.
1 /*
2  * Copyright 2019 OmniSci, Inc.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #include "TableOptimizer.h"
18 
19 #include <Analyzer/Analyzer.h>
20 #include <Shared/scope.h>
21 
22 namespace {
23 
24 template <typename T>
26  const auto stv = boost::get<ScalarTargetValue>(&tv);
27  CHECK(stv);
28  const auto val_ptr = boost::get<T>(stv);
29  CHECK(val_ptr);
30  return *val_ptr;
31 }
32 
34  const std::vector<TargetValue>& row,
35  const SQLTypeInfo& ti,
36  const bool has_nulls) {
37  switch (ti.get_type()) {
38  case kBOOLEAN:
39  case kTINYINT:
40  case kSMALLINT:
41  case kINT:
42  case kBIGINT:
43  case kNUMERIC:
44  case kDECIMAL:
45  case kTIME:
46  case kTIMESTAMP:
47  case kDATE: {
48  int64_t min_val = read_scalar_target_value<int64_t>(row[0]);
49  int64_t max_val = read_scalar_target_value<int64_t>(row[1]);
50  chunk_metadata.fillChunkStats(min_val, max_val, has_nulls);
51  break;
52  }
53  case kFLOAT: {
54  float min_val = read_scalar_target_value<float>(row[0]);
55  float max_val = read_scalar_target_value<float>(row[1]);
56  chunk_metadata.fillChunkStats(min_val, max_val, has_nulls);
57  break;
58  }
59  case kDOUBLE: {
60  double min_val = read_scalar_target_value<double>(row[0]);
61  double max_val = read_scalar_target_value<double>(row[1]);
62  chunk_metadata.fillChunkStats(min_val, max_val, has_nulls);
63  break;
64  }
65  case kVARCHAR:
66  case kCHAR:
67  case kTEXT:
68  if (ti.get_compression() == kENCODING_DICT) {
69  int64_t min_val = read_scalar_target_value<int64_t>(row[0]);
70  int64_t max_val = read_scalar_target_value<int64_t>(row[1]);
71  chunk_metadata.fillChunkStats(min_val, max_val, has_nulls);
72  }
73  break;
74  default: {
75  return false; // skip column
76  }
77  }
78  return true;
79 }
80 
// build_ra_exe_unit: builds a RelAlgExecutionUnit for a single input column
// and the given target expressions.
//
// NOTE(review): this listing lost the function's header line. Per the
// declaration index of this page it reads:
//   RelAlgExecutionUnit build_ra_exe_unit(
//       const std::shared_ptr<const InputColDescriptor> input_col_desc,
//       const std::vector<Analyzer::Expr*>& target_exprs)
//
// The initializers are positional: the scan descriptor of `input_col_desc`,
// then `input_col_desc` itself, four empty initializers, `target_exprs`,
// nullptr, and a trailing 0. Which RelAlgExecutionUnit member each position
// maps to is not visible here -- TODO confirm against its definition.
82  const std::shared_ptr<const InputColDescriptor> input_col_desc,
83  const std::vector<Analyzer::Expr*>& target_exprs) {
84  return RelAlgExecutionUnit{{input_col_desc->getScanDesc()},
85  {input_col_desc},
86  {},
87  {},
88  {},
89  {},
90  target_exprs,
91  nullptr,
// NOTE(review): listing line 92 (one initializer between `nullptr` and `0`)
// is missing from this extraction; restore it from the original file.
93  0};
94 }
95 
97  return CompilationOptions{device_type, false, ExecutorOptLevel::Default, false};
98 }
99 
101  return ExecutionOptions{
102  false, false, false, false, false, false, false, false, 0, false, false, 0};
103 }
104 
105 } // namespace
106 
108  INJECT_TIMER(optimizeMetadata);
109  std::lock_guard<std::mutex> lock(executor_->execute_mutex_);
110 
111  LOG(INFO) << "Recomputing metadata for " << td_->tableName;
112 
113  CHECK_GE(td_->tableId, 0);
114 
115  std::vector<const TableDescriptor*> table_descriptors;
116  if (td_->nShards > 0) {
117  const auto physical_tds = cat_.getPhysicalTablesDescriptors(td_);
118  table_descriptors.insert(
119  table_descriptors.begin(), physical_tds.begin(), physical_tds.end());
120  } else {
121  table_descriptors.push_back(td_);
122  }
123 
124  auto& data_mgr = cat_.getDataMgr();
125 
126  for (const auto td : table_descriptors) {
127  ScopeGuard row_set_holder = [this] { executor_->row_set_mem_owner_ = nullptr; };
128  executor_->row_set_mem_owner_ = std::make_shared<RowSetMemoryOwner>();
129  executor_->catalog_ = &cat_;
130  const auto table_id = td->tableId;
131 
132  std::unordered_map</*fragment_id*/ int, size_t> tuple_count_map;
133 
134  // Special case handle $deleted column if it exists
135  // whilst handling the delete column also capture
136  // the number of non deleted rows per fragment
137  if (td->hasDeletedCol) {
138  auto cd = cat_.getDeletedColumn(td);
139  const auto column_id = cd->columnId;
140 
141  const auto input_col_desc =
142  std::make_shared<const InputColDescriptor>(column_id, table_id, 0);
143  const auto col_expr =
144  makeExpr<Analyzer::ColumnVar>(cd->columnType, table_id, column_id, 0);
145  const auto count_expr =
146  makeExpr<Analyzer::AggExpr>(cd->columnType, kCOUNT, col_expr, false, nullptr);
147 
148  const auto ra_exe_unit = build_ra_exe_unit(input_col_desc, {count_expr.get()});
149  const auto table_infos = get_table_infos(ra_exe_unit, executor_);
150  CHECK_EQ(table_infos.size(), size_t(1));
151 
153  const auto eo = get_execution_options();
154 
155  std::unordered_map</*fragment_id*/ int, ChunkStats> stats_map;
156 
157  size_t total_num_tuples = 0;
158  PerFragmentCB compute_deleted_callback =
159  [&stats_map, &tuple_count_map, &total_num_tuples, cd](
160  ResultSetPtr results,
161  const Fragmenter_Namespace::FragmentInfo& fragment_info) {
162  // count number of tuples in $deleted as total number of tuples in table.
163  if (cd->isDeletedCol) {
164  total_num_tuples += fragment_info.getPhysicalNumTuples();
165  }
166  if (fragment_info.getPhysicalNumTuples() == 0) {
167  // TODO(adb): Should not happen, but just to be safe...
168  LOG(WARNING) << "Skipping completely empty fragment for column "
169  << cd->columnName;
170  return;
171  }
172 
173  const auto row = results->getNextRow(false, false);
174  CHECK_EQ(row.size(), size_t(1));
175 
176  const auto& ti = cd->columnType;
177 
178  ChunkMetadata chunk_metadata;
179  chunk_metadata.sqlType = get_logical_type_info(ti);
180 
181  const auto count_val = read_scalar_target_value<int64_t>(row[0]);
182  if (count_val == 0) {
183  // Assume chunk of all nulls, bail
184  return;
185  }
186 
187  // min element 0 max element 1
188  std::vector<TargetValue> fakerow;
189 
190  auto num_tuples = static_cast<size_t>(count_val);
191 
192  // calculate min
193  if (num_tuples == fragment_info.getPhysicalNumTuples()) {
194  // nothing deleted
195  // min = false;
196  // max = false;
197  fakerow.emplace_back(TargetValue{int64_t(0)});
198  fakerow.emplace_back(TargetValue{int64_t(0)});
199  } else {
200  if (num_tuples == 0) {
201  // everything marked as delete
202  // min = true
203  // max = true
204  fakerow.emplace_back(TargetValue{int64_t(1)});
205  fakerow.emplace_back(TargetValue{int64_t(1)});
206  } else {
207  // some deleted
208  // min = false
209  // max = true;
210  fakerow.emplace_back(TargetValue{int64_t(0)});
211  fakerow.emplace_back(TargetValue{int64_t(1)});
212  }
213  }
214 
215  // place manufacture min and max in fake row to use common infra
216  if (!set_metadata_from_results(chunk_metadata, fakerow, ti, false)) {
217  LOG(WARNING) << "Unable to process new metadata values for column "
218  << cd->columnName;
219  return;
220  }
221 
222  stats_map.emplace(
223  std::make_pair(fragment_info.fragmentId, chunk_metadata.chunkStats));
224  tuple_count_map.emplace(std::make_pair(fragment_info.fragmentId, num_tuples));
225  };
226 
227  executor_->executeWorkUnitPerFragment(
228  ra_exe_unit, table_infos[0], co, eo, cat_, compute_deleted_callback);
229 
230  auto* fragmenter = td->fragmenter;
231  CHECK(fragmenter);
232  fragmenter->updateChunkStats(cd, stats_map);
233  fragmenter->setNumRows(total_num_tuples);
234  } // finished special handling deleted column;
235 
236  // TODO(adb): Support geo
237  auto col_descs = cat_.getAllColumnMetadataForTable(table_id, false, false, false);
238  for (const auto& cd : col_descs) {
239  const auto ti = cd->columnType;
240  const auto column_id = cd->columnId;
241 
242  if (ti.is_varlen()) {
243  LOG(INFO) << "Skipping varlen column " << cd->columnName;
244  continue;
245  }
246 
247  const auto input_col_desc =
248  std::make_shared<const InputColDescriptor>(column_id, table_id, 0);
249  const auto col_expr =
250  makeExpr<Analyzer::ColumnVar>(cd->columnType, table_id, column_id, 0);
251  auto max_expr =
252  makeExpr<Analyzer::AggExpr>(cd->columnType, kMAX, col_expr, false, nullptr);
253  auto min_expr =
254  makeExpr<Analyzer::AggExpr>(cd->columnType, kMIN, col_expr, false, nullptr);
255  auto count_expr =
256  makeExpr<Analyzer::AggExpr>(cd->columnType, kCOUNT, col_expr, false, nullptr);
257 
258  if (ti.is_string()) {
259  const SQLTypeInfo fun_ti(kINT);
260  const auto fun_expr = makeExpr<Analyzer::KeyForStringExpr>(col_expr);
261  max_expr = makeExpr<Analyzer::AggExpr>(fun_ti, kMAX, fun_expr, false, nullptr);
262  min_expr = makeExpr<Analyzer::AggExpr>(fun_ti, kMIN, fun_expr, false, nullptr);
263  }
264  const auto ra_exe_unit = build_ra_exe_unit(
265  input_col_desc, {min_expr.get(), max_expr.get(), count_expr.get()});
266  const auto table_infos = get_table_infos(ra_exe_unit, executor_);
267  CHECK_EQ(table_infos.size(), size_t(1));
268 
270  const auto eo = get_execution_options();
271 
272  std::unordered_map</*fragment_id*/ int, ChunkStats> stats_map;
273 
274  PerFragmentCB compute_metadata_callback =
275  [&stats_map, &tuple_count_map, cd](
276  ResultSetPtr results,
277  const Fragmenter_Namespace::FragmentInfo& fragment_info) {
278  if (fragment_info.getPhysicalNumTuples() == 0) {
279  // TODO(adb): Should not happen, but just to be safe...
280  LOG(WARNING) << "Skipping completely empty fragment for column "
281  << cd->columnName;
282  return;
283  }
284 
285  const auto row = results->getNextRow(false, false);
286  CHECK_EQ(row.size(), size_t(3));
287 
288  const auto& ti = cd->columnType;
289 
290  ChunkMetadata chunk_metadata;
291  chunk_metadata.sqlType = get_logical_type_info(ti);
292 
293  const auto count_val = read_scalar_target_value<int64_t>(row[2]);
294  if (count_val == 0) {
295  // Assume chunk of all nulls, bail
296  return;
297  }
298 
299  bool has_nulls = true; // default to wide
300  auto tuple_count_itr = tuple_count_map.find(fragment_info.fragmentId);
301  if (tuple_count_itr != tuple_count_map.end()) {
302  has_nulls = !(static_cast<size_t>(count_val) == tuple_count_itr->second);
303  } else {
304  // no deleted column calc so use raw physical count
305  has_nulls = !(static_cast<size_t>(count_val) ==
306  fragment_info.getPhysicalNumTuples());
307  }
308 
309  if (!set_metadata_from_results(chunk_metadata, row, ti, has_nulls)) {
310  LOG(WARNING) << "Unable to process new metadata values for column "
311  << cd->columnName;
312  return;
313  }
314 
315  stats_map.emplace(
316  std::make_pair(fragment_info.fragmentId, chunk_metadata.chunkStats));
317  };
318 
319  executor_->executeWorkUnitPerFragment(
320  ra_exe_unit, table_infos[0], co, eo, cat_, compute_metadata_callback);
321 
322  auto* fragmenter = td->fragmenter;
323  CHECK(fragmenter);
324  fragmenter->updateChunkStats(cd, stats_map);
325  }
326  data_mgr.checkpoint(cat_.getCurrentDB().dbId, table_id);
327  executor_->clearMetaInfoCache();
328  }
329 
330  data_mgr.clearMemory(Data_Namespace::MemoryLevel::CPU_LEVEL);
331  if (data_mgr.gpusPresent()) {
332  data_mgr.clearMemory(Data_Namespace::MemoryLevel::GPU_LEVEL);
333  }
334 }
335 
337  const auto table_id = td_->tableId;
338  cat_.vacuumDeletedRows(table_id);
339  cat_.checkpoint(table_id);
340 }
Defines data structures for the semantic analysis phase of query processing.
#define CHECK_EQ(x, y)
Definition: Logger.h:195
Definition: sqltypes.h:51
RelAlgExecutionUnit build_ra_exe_unit(const std::shared_ptr< const InputColDescriptor > input_col_desc, const std::vector< Analyzer::Expr *> &target_exprs)
std::string tableName
ExecutorDeviceType
Data_Namespace::DataMgr & getDataMgr() const
Definition: Catalog.h:177
#define LOG(tag)
Definition: Logger.h:182
CompilationOptions get_compilation_options(const ExecutorDeviceType &device_type)
HOST DEVICE SQLTypes get_type() const
Definition: sqltypes.h:319
void fillChunkStats(const T min, const T max, const bool has_nulls)
Definition: ChunkMetadata.h:38
#define CHECK_GE(x, y)
Definition: Logger.h:200
SQLTypeInfo get_logical_type_info(const SQLTypeInfo &type_info)
Definition: sqltypes.h:836
std::shared_ptr< ResultSet > ResultSetPtr
std::vector< const TableDescriptor * > getPhysicalTablesDescriptors(const TableDescriptor *logicalTableDesc) const
Definition: Catalog.cpp:2895
void vacuumDeletedRows(const TableDescriptor *td) const
Definition: Catalog.cpp:3028
HOST DEVICE EncodingType get_compression() const
Definition: sqltypes.h:327
ChunkStats chunkStats
Definition: ChunkMetadata.h:35
const TableDescriptor * td_
Definition: sqldefs.h:71
Executor * executor_
T read_scalar_target_value(const TargetValue &tv)
void checkpoint(const int logicalTableId) const
Definition: Catalog.cpp:2927
#define INJECT_TIMER(DESC)
Definition: measure.h:91
Used by Fragmenter classes to store info about each fragment - the fragment id and number of tuples(r...
Definition: Fragmenter.h:79
const DBMetadata & getCurrentDB() const
Definition: Catalog.h:176
void vacuumDeletedRows() const
Compacts fragments to remove deleted rows. When a row is deleted, a boolean deleted system column is ...
Definition: sqltypes.h:54
Definition: sqltypes.h:55
void recomputeMetadata() const
Recomputes per-chunk metadata for each fragment in the table. Updates and deletes can cause chunk met...
std::list< const ColumnDescriptor * > getAllColumnMetadataForTable(const int tableId, const bool fetchSystemColumns, const bool fetchVirtualColumns, const bool fetchPhysicalColumns) const
Returns a list of pointers to constant ColumnDescriptor structs for all the columns from a particular...
Definition: Catalog.cpp:1579
Definition: sqldefs.h:71
Definition: sqltypes.h:43
const ColumnDescriptor * getDeletedColumn(const TableDescriptor *td) const
Definition: Catalog.cpp:2177
bool set_metadata_from_results(ChunkMetadata &chunk_metadata, const std::vector< TargetValue > &row, const SQLTypeInfo &ti, const bool has_nulls)
std::function< void(ResultSetPtr, const Fragmenter_Namespace::FragmentInfo &)> PerFragmentCB
Definition: Execute.h:324
#define CHECK(condition)
Definition: Logger.h:187
std::vector< InputTableInfo > get_table_infos(const std::vector< InputDescriptor > &input_descs, Executor *executor)
boost::variant< ScalarTargetValue, ArrayTargetValue, GeoTargetValue, GeoTargetValuePtr > TargetValue
Definition: TargetValue.h:167
Definition: sqltypes.h:47
Definition: sqldefs.h:71
SQLTypeInfo sqlType
Definition: ChunkMetadata.h:32
const Catalog_Namespace::Catalog & cat_