OmniSciDB  16c4e035a1
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
TableArchiver Class Reference

#include <TableArchiver.h>

+ Collaboration diagram for TableArchiver:

Public Member Functions

 TableArchiver (Catalog_Namespace::Catalog *cat)
 
void dumpTable (const TableDescriptor *td, const std::string &archive_path, const std::string &compression)
 
void restoreTable (const Catalog_Namespace::SessionInfo &session, const TableDescriptor *td, const std::string &archive_path, const std::string &compression)
 
void restoreTable (const Catalog_Namespace::SessionInfo &session, const std::string &table_name, const std::string &archive_path, const std::string &compression)
 

Private Attributes

Catalog_Namespace::Catalog * cat_
 

Detailed Description

Definition at line 24 of file TableArchiver.h.

Constructor & Destructor Documentation

TableArchiver::TableArchiver ( Catalog_Namespace::Catalog *  cat)
inline

Definition at line 26 of file TableArchiver.h.

26 : cat_(cat){};
Catalog_Namespace::Catalog * cat_
Definition: TableArchiver.h:43

Member Function Documentation

void TableArchiver::dumpTable ( const TableDescriptor *  td,
const std::string &  archive_path,
const std::string &  compression 
)

Definition at line 259 of file TableArchiver.cpp.

References anonymous_namespace{TableArchiver.cpp}::abs_path(), cat_, Catalog_Namespace::DBMetadata::dbId, Data_Namespace::DISK_LEVEL, Catalog_Namespace::Catalog::dumpSchema(), ddl_utils::EXPORT, omnisci::fopen(), g_cluster, get_quoted_string(), Catalog_Namespace::Catalog::getAllColumnMetadataForTable(), Catalog_Namespace::Catalog::getColumnDictDirectory(), Catalog_Namespace::Catalog::getCurrentDB(), Catalog_Namespace::Catalog::getDataMgr(), Data_Namespace::DataMgr::getGlobalFileMgr(), Catalog_Namespace::Catalog::getTableDataDirectories(), Catalog_Namespace::Catalog::getTableDictDirectories(), Catalog_Namespace::Catalog::getTableEpoch(), TableDescriptor::is_system_table, TableDescriptor::isView, join(), TableDescriptor::persistenceLevel, run, anonymous_namespace{TableArchiver.cpp}::simple_file_closer, table_epoch_filename, table_oldinfo_filename, table_schema_filename, TableDescriptor::tableId, TableDescriptor::tableName, to_string(), shared::transform(), and ddl_utils::validate_allowed_file_path().

Referenced by Parser::DumpTableStmt::execute().

261  {
262  if (td->is_system_table) {
263  throw std::runtime_error("Dumping a system table is not supported.");
264  }
267  if (g_cluster) {
268  throw std::runtime_error("DUMP/RESTORE is not supported yet on distributed setup.");
269  }
270  if (boost::filesystem::exists(archive_path)) {
271  throw std::runtime_error("Archive " + archive_path + " already exists.");
272  }
274  throw std::runtime_error("Dumping view or temporary table is not supported.");
275  }
276  // collect paths of files to archive
277  const auto global_file_mgr = cat_->getDataMgr().getGlobalFileMgr();
278  std::vector<std::string> file_paths;
279  auto file_writer = [&file_paths, global_file_mgr](const std::string& file_name,
280  const std::string& file_type,
281  const std::string& file_data) {
282  const auto file_path = abs_path(global_file_mgr) + "/" + file_name;
283  std::unique_ptr<FILE, decltype(simple_file_closer)> fp(
284  std::fopen(file_path.c_str(), "w"), simple_file_closer);
285  if (!fp) {
286  throw std::runtime_error("Failed to create " + file_type + " file '" + file_path +
287  "': " + std::strerror(errno));
288  }
289  if (std::fwrite(file_data.data(), 1, file_data.size(), fp.get()) < file_data.size()) {
290  throw std::runtime_error("Failed to write " + file_type + " file '" + file_path +
291  "': " + std::strerror(errno));
292  }
293  file_paths.push_back(file_name);
294  };
295 
296  const auto table_name = td->tableName;
297  {
298  // - gen schema file
299  const auto schema_str = cat_->dumpSchema(td);
300  file_writer(table_schema_filename, "table schema", schema_str);
301  // - gen column-old-info file
302  const auto cds = cat_->getAllColumnMetadataForTable(td->tableId, true, true, true);
303  std::vector<std::string> column_oldinfo;
304  std::transform(cds.begin(),
305  cds.end(),
306  std::back_inserter(column_oldinfo),
307  [&](const auto cd) -> std::string {
308  return cd->columnName + ":" + std::to_string(cd->columnId) + ":" +
310  });
311  const auto column_oldinfo_str = boost::algorithm::join(column_oldinfo, " ");
312  file_writer(table_oldinfo_filename, "table old info", column_oldinfo_str);
313  // - gen table epoch
314  const auto epoch = cat_->getTableEpoch(cat_->getCurrentDB().dbId, td->tableId);
315  file_writer(table_epoch_filename, "table epoch", std::to_string(epoch));
316  // - collect table data file paths ...
317  const auto data_file_dirs = cat_->getTableDataDirectories(td);
318  file_paths.insert(file_paths.end(), data_file_dirs.begin(), data_file_dirs.end());
319  // - collect table dict file paths ...
320  const auto dict_file_dirs = cat_->getTableDictDirectories(td);
321  file_paths.insert(file_paths.end(), dict_file_dirs.begin(), dict_file_dirs.end());
322  // tar takes time. release cat lock to yield the cat to concurrent CREATE statements.
323  }
324  // run tar to archive the files ... this may take a while !!
325  run("tar " + compression + " -cvf " + get_quoted_string(archive_path) + " " +
326  boost::algorithm::join(file_paths, " "),
327  abs_path(global_file_mgr));
328 }
std::string getColumnDictDirectory(const ColumnDescriptor *cd, bool file_name_only=true) const
Definition: Catalog.cpp:4466
static constexpr char const * table_schema_filename
std::string tableName
std::string abs_path(const File_Namespace::GlobalFileMgr *global_file_mgr)
::FILE * fopen(const char *filename, const char *mode)
Definition: omnisci_fs.cpp:72
Data_Namespace::DataMgr & getDataMgr() const
Definition: Catalog.h:226
std::string join(T const &container, std::string const &delim)
static constexpr char const * table_oldinfo_filename
int32_t getTableEpoch(const int32_t db_id, const int32_t table_id) const
Definition: Catalog.cpp:2885
std::vector< std::string > getTableDataDirectories(const TableDescriptor *td) const
Definition: Catalog.cpp:4452
std::string to_string(char const *&&v)
std::string get_quoted_string(const std::string &filename, char quote, char escape)
Quote a string while escaping any existing quotes in the string.
const DBMetadata & getCurrentDB() const
Definition: Catalog.h:225
OUTPUT transform(INPUT const &input, FUNC const &func)
Definition: misc.h:290
File_Namespace::GlobalFileMgr * getGlobalFileMgr() const
Definition: DataMgr.cpp:616
void validate_allowed_file_path(const std::string &file_path, const DataTransferType data_transfer_type, const bool allow_wildcards)
Definition: DdlUtils.cpp:770
std::vector< std::string > getTableDictDirectories(const TableDescriptor *td) const
Definition: Catalog.cpp:4487
std::list< const ColumnDescriptor * > getAllColumnMetadataForTable(const int tableId, const bool fetchSystemColumns, const bool fetchVirtualColumns, const bool fetchPhysicalColumns) const
Returns a list of pointers to constant ColumnDescriptor structs for all the columns from a particular...
Definition: Catalog.cpp:1811
Data_Namespace::MemoryLevel persistenceLevel
static constexpr char const * table_epoch_filename
bool g_cluster
static bool run
std::string dumpSchema(const TableDescriptor *td) const
Definition: Catalog.cpp:4515
Catalog_Namespace::Catalog * cat_
Definition: TableArchiver.h:43

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

void TableArchiver::restoreTable ( const Catalog_Namespace::SessionInfo &  session,
const TableDescriptor *  td,
const std::string &  archive_path,
const std::string &  compression 
)

Definition at line 331 of file TableArchiver.cpp.

References anonymous_namespace{Utm.h}::a, anonymous_namespace{TableArchiver.cpp}::abs_path(), anonymous_namespace{TableArchiver.cpp}::adjust_altered_table_files(), cat_, Catalog_Namespace::DBMetadata::dbId, Data_Namespace::DISK_LEVEL, measure< TimeT >::execution(), g_cluster, g_test_rollback_dump_restore, get_quoted_string(), anonymous_namespace{TableArchiver.cpp}::get_table_schema(), Catalog_Namespace::Catalog::getAllColumnMetadataForTable(), Catalog_Namespace::Catalog::getColumnDictDirectory(), Catalog_Namespace::Catalog::getCurrentDB(), Catalog_Namespace::Catalog::getDataMgr(), Data_Namespace::DataMgr::getGlobalFileMgr(), lockmgr::TableLockMgrImpl< TableSchemaLockMgr >::getReadLockForTable(), Catalog_Namespace::Catalog::getTableDataDirectories(), Catalog_Namespace::Catalog::getTableDictDirectories(), lockmgr::TableLockMgrImpl< InsertDataLockMgr >::getWriteLockForTable(), TableDescriptor::hasDeletedCol, ddl_utils::IMPORT, TableDescriptor::isView, join(), TableDescriptor::nShards, TableDescriptor::persistenceLevel, anonymous_namespace{TableArchiver.cpp}::rename_table_directories(), run, Catalog_Namespace::Catalog::setTableEpoch(), anonymous_namespace{TableArchiver.cpp}::simple_file_cat(), split(), table_epoch_filename, table_oldinfo_filename, table_schema_filename, TableDescriptor::tableId, TableDescriptor::tableName, to_string(), shared::transform(), ddl_utils::validate_allowed_file_path(), and VLOG.

Referenced by Parser::RestoreTableStmt::execute(), and restoreTable().

334  {
337  if (g_cluster) {
338  throw std::runtime_error("DUMP/RESTORE is not supported yet on distributed setup.");
339  }
340  if (!boost::filesystem::exists(archive_path)) {
341  throw std::runtime_error("Archive " + archive_path + " does not exist.");
342  }
344  throw std::runtime_error("Restoring view or temporary table is not supported.");
345  }
346  // Obtain table schema read lock to prevent modification of the schema during
347  // restoration
348  const auto table_read_lock =
350  // prevent concurrent inserts into table during restoration
351  const auto insert_data_lock =
353 
354  // untar takes time. no grab of cat lock to yield to concurrent CREATE stmts.
355  const auto global_file_mgr = cat_->getDataMgr().getGlobalFileMgr();
356  // dirs where src files are untarred and dst files are backed up
357  constexpr static const auto temp_data_basename = "_data";
358  constexpr static const auto temp_back_basename = "_back";
359  const auto temp_data_dir = abs_path(global_file_mgr) + "/" + temp_data_basename;
360  const auto temp_back_dir = abs_path(global_file_mgr) + "/" + temp_back_basename;
361  // clean up tmp dirs and files in any case
362  auto tmp_files_cleaner = [&](void*) {
363  run("rm -rf " + temp_data_dir + " " + temp_back_dir);
364  run("rm -f " + abs_path(global_file_mgr) + "/" + table_schema_filename);
365  run("rm -f " + abs_path(global_file_mgr) + "/" + table_oldinfo_filename);
366  run("rm -f " + abs_path(global_file_mgr) + "/" + table_epoch_filename);
367  };
368  std::unique_ptr<decltype(tmp_files_cleaner), decltype(tmp_files_cleaner)> tfc(
369  &tmp_files_cleaner, tmp_files_cleaner);
370  // extract & parse schema
371  const auto schema_str = get_table_schema(archive_path, td->tableName, compression);
372  const auto create_table_stmt =
373  Parser::parseDDL<Parser::CreateTableStmt>("table schema", schema_str);
374  // verify compatibility between source and destination schemas
375  TableDescriptor src_td;
376  std::list<ColumnDescriptor> src_columns;
377  std::vector<Parser::SharedDictionaryDef> shared_dict_defs;
378  create_table_stmt->executeDryRun(session, src_td, src_columns, shared_dict_defs);
379  // - sanity check table-level compatibility
380  if (src_td.hasDeletedCol != td->hasDeletedCol) {
381  // TODO: allow the case, in which src data enables vacuum while
382  // dst doesn't, by simply discarding src $deleted column data.
383  throw std::runtime_error("Incompatible table VACCUM option");
384  }
385  if (src_td.nShards != td->nShards) {
386  // TODO: allow different shard numbers if they have a "GCD",
387  // by splitting/merging src data files before drop into dst.
388  throw std::runtime_error("Unmatched number of table shards");
389  }
390  // - sanity check column-level compatibility (based on column names)
391  const auto dst_columns =
392  cat_->getAllColumnMetadataForTable(td->tableId, false, false, false);
393  if (dst_columns.size() != src_columns.size()) {
394  throw std::runtime_error("Unmatched number of table columns");
395  }
396  for (const auto& [src_cd, dst_cd] : boost::combine(src_columns, dst_columns)) {
397  if (src_cd.columnType.get_type_name() != dst_cd->columnType.get_type_name() ||
398  src_cd.columnType.get_compression_name() !=
399  dst_cd->columnType.get_compression_name()) {
400  throw std::runtime_error("Incompatible types on column " + src_cd.columnName);
401  }
402  }
403  // extract src table column ids (ALL columns incl. system/virtual/phy geo cols)
404  const auto all_src_oldinfo_str =
405  simple_file_cat(archive_path, table_oldinfo_filename, compression);
406  std::vector<std::string> src_oldinfo_strs;
407  boost::algorithm::split(src_oldinfo_strs,
408  all_src_oldinfo_str,
409  boost::is_any_of(" "),
410  boost::token_compress_on);
411  auto all_dst_columns =
412  cat_->getAllColumnMetadataForTable(td->tableId, true, true, true);
413  if (src_oldinfo_strs.size() != all_dst_columns.size()) {
414  throw std::runtime_error("Source table has a unmatched number of columns: " +
415  std::to_string(src_oldinfo_strs.size()) + " vs " +
416  std::to_string(all_dst_columns.size()));
417  }
418  // build a map of src column ids and dst column ids, just in case src table has been
419  // ALTERed before and chunk keys of src table needs to be adjusted accordingly.
420  // note: this map is used only for the case of migrating a table and not for restoring
421  // a table. When restoring a table, the two tables must have the same column ids.
422  //
423  // also build a map of src dict paths and dst dict paths for relocating src dicts
424  std::unordered_map<int, int> column_ids_map;
425  std::unordered_map<std::string, std::string> dict_paths_map;
426  // sort inputs of transform in lexical order of column names for correct mappings
427  std::list<std::vector<std::string>> src_oldinfo_tokens;
429  src_oldinfo_strs.begin(),
430  src_oldinfo_strs.end(),
431  std::back_inserter(src_oldinfo_tokens),
432  [](const auto& src_oldinfo_str) -> auto {
433  std::vector<std::string> tokens;
435  tokens, src_oldinfo_str, boost::is_any_of(":"), boost::token_compress_on);
436  return tokens;
437  });
438  src_oldinfo_tokens.sort(
439  [](const auto& lhs, const auto& rhs) { return lhs[0].compare(rhs[0]) < 0; });
440  all_dst_columns.sort(
441  [](auto a, auto b) { return a->columnName.compare(b->columnName) < 0; });
442  // transform inputs into the maps
443  std::transform(src_oldinfo_tokens.begin(),
444  src_oldinfo_tokens.end(),
445  all_dst_columns.begin(),
446  std::inserter(column_ids_map, column_ids_map.end()),
447  [&](const auto& tokens, const auto& cd) -> std::pair<int, int> {
448  VLOG(3) << boost::algorithm::join(tokens, ":") << " ==> "
449  << cd->columnName << ":" << cd->columnId;
450  dict_paths_map[tokens[2]] = cat_->getColumnDictDirectory(cd);
451  return {boost::lexical_cast<int>(tokens[1]), cd->columnId};
452  });
453  bool was_table_altered = false;
454  std::for_each(column_ids_map.begin(), column_ids_map.end(), [&](auto& it) {
455  was_table_altered = was_table_altered || it.first != it.second;
456  });
457  VLOG(3) << "was_table_altered = " << was_table_altered;
458  // extract all data files to a temp dir. will swap with dst table dir after all set,
459  // otherwise will corrupt table in case any bad thing happens in the middle.
460  run("rm -rf " + temp_data_dir);
461  run("mkdir -p " + temp_data_dir);
462  run("tar " + compression + " -xvf " + get_quoted_string(archive_path), temp_data_dir);
463  // if table was ever altered after it was created, update column ids in chunk headers.
464  if (was_table_altered) {
465  const auto time_ms = measure<>::execution(
466  [&]() { adjust_altered_table_files(temp_data_dir, column_ids_map); });
467  VLOG(3) << "adjust_altered_table_files: " << time_ms << " ms";
468  }
469  // finally,,, swap table data/dict dirs!
470  const auto data_file_dirs = cat_->getTableDataDirectories(td);
471  const auto dict_file_dirs = cat_->getTableDictDirectories(td);
472  // move current target dirs, if exists, to backup dir
473  std::vector<std::string> both_file_dirs;
474  std::merge(data_file_dirs.begin(),
475  data_file_dirs.end(),
476  dict_file_dirs.begin(),
477  dict_file_dirs.end(),
478  std::back_inserter(both_file_dirs));
479  bool backup_completed = false;
480  try {
481  run("rm -rf " + temp_back_dir);
482  run("mkdir -p " + temp_back_dir);
483  for (const auto& dir : both_file_dirs) {
484  const auto dir_full_path = abs_path(global_file_mgr) + "/" + dir;
485  if (boost::filesystem::is_directory(dir_full_path)) {
486  run("mv " + dir_full_path + " " + temp_back_dir);
487  }
488  }
489  backup_completed = true;
490  // accord src data dirs to dst
491  rename_table_directories(
492  cat_->getDataMgr().getGlobalFileMgr(), temp_data_dir, data_file_dirs, "table_");
493  // accord src dict dirs to dst
494  for (const auto& dit : dict_paths_map) {
495  if (!dit.first.empty() && !dit.second.empty()) {
496  const auto src_dict_path = temp_data_dir + "/" + dit.first;
497  const auto dst_dict_path = abs_path(global_file_mgr) + "/" + dit.second;
498  run("mv " + src_dict_path + " " + dst_dict_path);
499  }
500  }
501  // throw if sanity test forces a rollback
502  if (g_test_rollback_dump_restore) {
503  throw std::runtime_error("lol!");
504  }
505  } catch (...) {
506  // once backup is completed, whatever in abs_path(global_file_mgr) is the "src"
507  // dirs that are to be rolled back and discarded
508  if (backup_completed) {
509  run("rm -rf " + boost::algorithm::join(both_file_dirs, " "),
510  abs_path(global_file_mgr));
511  }
512  // complete rollback by recovering original "dst" table dirs from backup dir
513  boost::filesystem::path base_path(temp_back_dir);
514  boost::filesystem::directory_iterator end_it;
515  for (boost::filesystem::directory_iterator fit(base_path); fit != end_it; ++fit) {
516  run("mv " + fit->path().string() + " .", abs_path(global_file_mgr));
517  }
518  throw;
519  }
520  // set for reloading table from the restored/migrated files
521  const auto epoch = simple_file_cat(archive_path, table_epoch_filename, compression);
522  cat_->setTableEpoch(
523  cat_->getCurrentDB().dbId, td->tableId, boost::lexical_cast<int>(epoch));
524 }
std::string get_table_schema(const std::string &archive_path, const std::string &table, const std::string &compression)
static ReadLock getReadLockForTable(const Catalog_Namespace::Catalog &cat, const std::string &table_name)
Definition: LockMgrImpl.h:164
std::string getColumnDictDirectory(const ColumnDescriptor *cd, bool file_name_only=true) const
Definition: Catalog.cpp:4466
static constexpr char const * table_schema_filename
std::string tableName
static TimeT::rep execution(F func, Args &&...args)
Definition: sample.cpp:29
std::string abs_path(const File_Namespace::GlobalFileMgr *global_file_mgr)
Data_Namespace::DataMgr & getDataMgr() const
Definition: Catalog.h:226
static WriteLock getWriteLockForTable(const Catalog_Namespace::Catalog &cat, const std::string &table_name)
Definition: LockMgrImpl.h:155
void adjust_altered_table_files(const std::string &temp_data_dir, const std::unordered_map< int, int > &column_ids_map)
std::string join(T const &container, std::string const &delim)
static constexpr char const * table_oldinfo_filename
std::vector< std::string > getTableDataDirectories(const TableDescriptor *td) const
Definition: Catalog.cpp:4452
std::string to_string(char const *&&v)
std::vector< std::string > split(std::string_view str, std::string_view delim, std::optional< size_t > maxsplit)
split apart a string into a vector of substrings
constexpr double a
Definition: Utm.h:32
std::string get_quoted_string(const std::string &filename, char quote, char escape)
Quote a string while escaping any existing quotes in the string.
const DBMetadata & getCurrentDB() const
Definition: Catalog.h:225
void rename_table_directories(const File_Namespace::GlobalFileMgr *global_file_mgr, const std::string &temp_data_dir, const std::vector< std::string > &target_paths, const std::string &name_prefix)
OUTPUT transform(INPUT const &input, FUNC const &func)
Definition: misc.h:290
File_Namespace::GlobalFileMgr * getGlobalFileMgr() const
Definition: DataMgr.cpp:616
void validate_allowed_file_path(const std::string &file_path, const DataTransferType data_transfer_type, const bool allow_wildcards)
Definition: DdlUtils.cpp:770
void setTableEpoch(const int db_id, const int table_id, const int new_epoch)
Definition: Catalog.cpp:2936
std::vector< std::string > getTableDictDirectories(const TableDescriptor *td) const
Definition: Catalog.cpp:4487
std::list< const ColumnDescriptor * > getAllColumnMetadataForTable(const int tableId, const bool fetchSystemColumns, const bool fetchVirtualColumns, const bool fetchPhysicalColumns) const
Returns a list of pointers to constant ColumnDescriptor structs for all the columns from a particular...
Definition: Catalog.cpp:1811
Data_Namespace::MemoryLevel persistenceLevel
static constexpr char const * table_epoch_filename
bool g_cluster
static bool run
std::string simple_file_cat(const std::string &archive_path, const std::string &file_name, const std::string &compression)
#define VLOG(n)
Definition: Logger.h:305
bool g_test_rollback_dump_restore
Catalog_Namespace::Catalog * cat_
Definition: TableArchiver.h:43

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

void TableArchiver::restoreTable ( const Catalog_Namespace::SessionInfo &  session,
const std::string &  table_name,
const std::string &  archive_path,
const std::string &  compression 
)

Definition at line 528 of file TableArchiver.cpp.

References cat_, anonymous_namespace{TableArchiver.cpp}::get_table_schema(), Catalog_Namespace::Catalog::getMetadataForTable(), and restoreTable().

531  {
532  // replace table name and drop foreign dict references
533  const auto schema_str = get_table_schema(archive_path, table_name, compression);
534  Parser::parseDDL<Parser::CreateTableStmt>("table schema", schema_str)->execute(session);
535  try {
536  restoreTable(
537  session, cat_->getMetadataForTable(table_name), archive_path, compression);
538  } catch (...) {
539  Parser::parseDDL<Parser::DropTableStmt>("statement",
540  "DROP TABLE IF EXISTS " + table_name + ";")
541  ->execute(session);
542  throw;
543  }
544 }
std::string get_table_schema(const std::string &archive_path, const std::string &table, const std::string &compression)
void restoreTable(const Catalog_Namespace::SessionInfo &session, const TableDescriptor *td, const std::string &archive_path, const std::string &compression)
const TableDescriptor * getMetadataForTable(const std::string &tableName, const bool populateFragmenter=true) const
Returns a pointer to a const TableDescriptor struct matching the provided tableName.
Catalog_Namespace::Catalog * cat_
Definition: TableArchiver.h:43

+ Here is the call graph for this function:

Member Data Documentation

Catalog_Namespace::Catalog* TableArchiver::cat_
private

Definition at line 43 of file TableArchiver.h.

Referenced by dumpTable(), and restoreTable().


The documentation for this class was generated from the following files: