From: Douglas Fuller
Date: Thu, 7 Jul 2016 18:39:27 +0000 (-0700)
Subject: cephfs: Permit recovering metadata into a new RADOS pool
X-Git-Tag: v12.0.2~134^2~1
X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=cb86740a5f4aa3eed43c7f09ac5e7e525a5c1d67;p=ceph.git

cephfs: Permit recovering metadata into a new RADOS pool

Add a procedure that permits reconstructing the metadata of a potentially
damaged CephFS metadata pool and writing the results into a freshly
initialized pool that refers to the same data pool. Add option flags to
override the checks that would ordinarily prevent this, and add options to
the recovery tools so that they write their output to a separate pool
instead of the pool belonging to the filesystem selected for recovery.

See doc/cephfs/disaster-recovery.rst for details.

Fixes: http://tracker.ceph.com/issues/15068
Fixes: http://tracker.ceph.com/issues/15069
Signed-off-by: Douglas Fuller
---

diff --git a/doc/cephfs/disaster-recovery.rst b/doc/cephfs/disaster-recovery.rst
index c40e0b411a8d..88bc4dd84dd9 100644
--- a/doc/cephfs/disaster-recovery.rst
+++ b/doc/cephfs/disaster-recovery.rst
@@ -216,3 +216,62 @@
 Note that this command acts as a normal CephFS client to find all the files in
 the filesystem and read their layouts, so the MDS must be up and running.
 
+Using an alternate metadata pool for recovery
+---------------------------------------------
+
+.. warning::
+
+   There has not been extensive testing of this procedure. It should be
+   undertaken with great care.
+
+If an existing filesystem is damaged and inoperative, it is possible to create
+a fresh metadata pool and attempt to reconstruct the filesystem metadata into
+this new pool, leaving the old metadata in place. This allows a safer attempt
+at recovery, since the existing metadata pool is not overwritten.
+
+.. caution::
+
+   During this process, multiple metadata pools will contain data referring to
+   the same data pool. Extreme caution must be exercised to avoid changing the
+   data pool contents while this is the case. Once recovery is complete, the
+   damaged metadata pool should be deleted.
+
+To begin this process, first create the fresh metadata pool and initialize
+it with empty file system data structures:
+
+::
+
+    ceph fs flag set enable_multiple true --yes-i-really-mean-it
+    ceph osd pool create recovery <pg-num> replicated
+    ceph fs new recovery-fs recovery <data pool> --allow-dangerous-metadata-overlay
+    cephfs-data-scan init --force-init --filesystem recovery-fs --alternate-pool recovery
+    ceph fs reset recovery-fs --yes-i-really-mean-it
+    cephfs-table-tool recovery-fs:all reset session
+    cephfs-table-tool recovery-fs:all reset snap
+    cephfs-table-tool recovery-fs:all reset inode
+
+Next, run the recovery toolset with the --alternate-pool argument to write its
+results to the alternate pool:
+
+::
+
+    cephfs-data-scan scan_extents --alternate-pool recovery --filesystem <original filesystem name> <original data pool name>
+    cephfs-data-scan scan_inodes --alternate-pool recovery --filesystem <original filesystem name> --force-corrupt --force-init <original data pool name>
+
+If the damaged filesystem contains dirty journal data, it may be recovered next
+with:
+
+::
+
+    cephfs-journal-tool --rank=<original filesystem name>:0 event recover_dentries list --alternate-pool recovery
+    cephfs-journal-tool --rank recovery-fs:0 journal reset --force
+
+After recovery, some recovered directories will have incorrect link counts.
+Ensure the parameter mds_debug_scatterstat is set to false (the default) to
+prevent the MDS from checking the link counts, then run a forward scrub to
+repair them.
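+
+One way to confirm the setting before scrubbing is through the MDS admin
+socket. This is only a sketch: the daemon name ``mds.a`` is illustrative and
+should be replaced with the identity of a running MDS:
+
+::
+
+    ceph daemon mds.a config get mds_debug_scatterstat
+    ceph daemon mds.a config set mds_debug_scatterstat false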
+
+Ensure you have an MDS running and issue:
+
+::
+
+    ceph daemon mds.a scrub_path / recursive repair
diff --git a/src/mds/CDir.cc b/src/mds/CDir.cc
index cfb80b49958c..2a495d02169c 100644
--- a/src/mds/CDir.cc
+++ b/src/mds/CDir.cc
@@ -1954,6 +1954,7 @@ void CDir::_go_bad()
 void CDir::go_bad_dentry(snapid_t last, const std::string &dname)
 {
+  dout(10) << "go_bad_dentry " << dname << dendl;
   const bool fatal = cache->mds->damage_table.notify_dentry(
       inode->ino(), frag, last, dname);
   if (fatal) {
@@ -1964,6 +1965,7 @@ void CDir::go_bad_dentry(snapid_t last, const std::string &dname)
 void CDir::go_bad(bool complete)
 {
+  dout(10) << "go_bad " << frag << dendl;
   const bool fatal = cache->mds->damage_table.notify_dirfrag(inode->ino(), frag);
   if (fatal) {
     cache->mds->damaged();
diff --git a/src/mds/FSMap.h b/src/mds/FSMap.h
index 2c15063202f5..881281edca17 100644
--- a/src/mds/FSMap.h
+++ b/src/mds/FSMap.h
@@ -434,6 +434,14 @@ public:
     }
     return nullptr;
   }
+  std::list<std::shared_ptr<Filesystem> > get_filesystems(void) const
+  {
+    std::list<std::shared_ptr<Filesystem> > ret;
+    for (const auto &i : filesystems) {
+      ret.push_back(std::const_pointer_cast<Filesystem>(i.second));
+    }
+    return ret;
+  }
   int parse_filesystem(
       std::string const &ns_str,
diff --git a/src/mon/FSCommands.cc b/src/mon/FSCommands.cc
index b66073519b01..cc2dfe04c0b2 100644
--- a/src/mon/FSCommands.cc
+++ b/src/mon/FSCommands.cc
@@ -151,11 +151,11 @@ class FsNewHandler : public FileSystemCommandHandler
       return -EINVAL;
     }
-    for (auto fs : pending_fsmap.get_filesystems()) {
-      const set<int64_t>& data_pools = fs.second->mds_map.get_data_pools();
+    for (auto fs : fsmap.get_filesystems()) {
+      const set<int64_t>& data_pools = fs->mds_map.get_data_pools();
       string sure;
       if ((data_pools.find(data) != data_pools.end()
-           || fs.second->mds_map.metadata_pool == metadata)
+           || fs->mds_map.get_metadata_pool() == metadata)
           && ((!cmd_getval(g_ceph_context, cmdmap, "sure", sure)
                || sure != "--allow-dangerous-metadata-overlay"))) {
         ss << "Filesystem '" << fs_name
diff --git a/src/mon/MonCommands.h b/src/mon/MonCommands.h
index f5574ce83d02..5c7949926867 100644
--- a/src/mon/MonCommands.h
+++ b/src/mon/MonCommands.h
@@ -332,8 +332,8 @@ COMMAND("fs new " \
        "name=fs_name,type=CephString " \
        "name=metadata,type=CephString " \
        "name=data,type=CephString " \
-       "name=force,type=CephChoices,strings=--force,req=false", \
-       "name=allow_overlay,type=CephChoices,strings=--allow-dangerous-metadata-overlay,req=false", \
+       "name=force,type=CephChoices,strings=--force,req=false " \
+       "name=sure,type=CephChoices,strings=--allow-dangerous-metadata-overlay,req=false", \
        "make new filesystem using named pools <metadata> and <data>", \
        "fs", "rw", "cli,rest")
 COMMAND("fs rm " \
diff --git a/src/tools/cephfs/DataScan.cc b/src/tools/cephfs/DataScan.cc
index a1d03c4cde05..29214d32c80d 100644
--- a/src/tools/cephfs/DataScan.cc
+++ b/src/tools/cephfs/DataScan.cc
@@ -103,6 +103,9 @@ bool DataScan::parse_kwarg(
     }
     fscid = fs->fscid;
     return true;
+  } else if (arg == std::string("--alternate-pool")) {
+    metadata_pool_name = val;
+    return true;
   } else {
     return false;
   }
@@ -147,7 +150,6 @@ int DataScan::main(const std::vector<const char*> &args)
   std::string const &command = args[0];
   std::string data_pool_name;
-  std::string metadata_pool_name;
   std::string pg_files_path;
   std::set pg_files_pgs;
@@ -228,7 +230,7 @@ int DataScan::main(const std::vector<const char*> &args)
     return r;
   }
-  r = driver->init(rados, fsmap, fscid);
+  r = driver->init(rados, metadata_pool_name, fsmap, fscid);
   if (r < 0) {
     return r;
   }
@@ -284,7 +286,6 @@ int DataScan::main(const std::vector<const char*> &args)
     int64_t const metadata_pool_id =
fs->mds_map.get_metadata_pool(); dout(4) << "resolving metadata pool " << metadata_pool_id << dendl; - std::string metadata_pool_name; int r = rados.pool_reverse_lookup(metadata_pool_id, &metadata_pool_name); if (r < 0) { std::cerr << "Pool " << metadata_pool_id @@ -493,6 +494,8 @@ int DataScan::scan_extents() uint64_t size; time_t mtime; int r = data_io.stat(oid, &size, &mtime); + dout(10) << "handling object " << obj_name_ino + << "." << obj_name_offset << dendl; if (r != 0) { dout(4) << "Cannot stat '" << oid << "': skipping" << dendl; return r; @@ -655,6 +658,10 @@ int DataScan::scan_inodes() { int r = 0; + dout(10) << "handling object " + << std::hex << obj_name_ino << "." << obj_name_offset << std::dec + << dendl; + AccumulateResult accum_res; inode_backtrace_t backtrace; file_layout_t loaded_layout = file_layout_t::get_default(); @@ -1778,26 +1785,31 @@ int MetadataDriver::inject_linkage( int MetadataDriver::init( - librados::Rados &rados, const FSMap *fsmap, fs_cluster_id_t fscid) + librados::Rados &rados, std::string &metadata_pool_name, const FSMap *fsmap, + fs_cluster_id_t fscid) { - auto fs = fsmap->get_filesystem(fscid); - assert(fs != nullptr); - int64_t const metadata_pool_id = fs->mds_map.get_metadata_pool(); + if (metadata_pool_name.empty()) { + auto fs = fsmap->get_filesystem(fscid); + assert(fs != nullptr); + int64_t const metadata_pool_id = fs->mds_map.get_metadata_pool(); - dout(4) << "resolving metadata pool " << metadata_pool_id << dendl; - std::string metadata_pool_name; - int r = rados.pool_reverse_lookup(metadata_pool_id, &metadata_pool_name); - if (r < 0) { - derr << "Pool " << metadata_pool_id - << " identified in MDS map not found in RADOS!" << dendl; - return r; + dout(4) << "resolving metadata pool " << metadata_pool_id << dendl; + int r = rados.pool_reverse_lookup(metadata_pool_id, &metadata_pool_name); + if (r < 0) { + derr << "Pool " << metadata_pool_id + << " identified in MDS map not found in RADOS!" 
<< dendl; + return r; + } + dout(4) << "found metadata pool '" << metadata_pool_name << "'" << dendl; + } else { + dout(4) << "forcing metadata pool '" << metadata_pool_name << "'" << dendl; } - dout(4) << "found metadata pool '" << metadata_pool_name << "'" << dendl; return rados.ioctx_create(metadata_pool_name.c_str(), metadata_io); } int LocalFileDriver::init( - librados::Rados &rados, const FSMap *fsmap, fs_cluster_id_t fscid) + librados::Rados &rados, std::string &metadata_pool_name, const FSMap *fsmap, + fs_cluster_id_t fscid) { return 0; } diff --git a/src/tools/cephfs/DataScan.h b/src/tools/cephfs/DataScan.h index 3133be282666..839da1610fe1 100644 --- a/src/tools/cephfs/DataScan.h +++ b/src/tools/cephfs/DataScan.h @@ -30,6 +30,7 @@ class RecoveryDriver { public: virtual int init( librados::Rados &rados, + std::string &metadata_pool_name, const FSMap *fsmap, fs_cluster_id_t fscid) = 0; @@ -118,6 +119,7 @@ class LocalFileDriver : public RecoveryDriver // Implement RecoveryDriver interface int init( librados::Rados &rados, + std::string &metadata_pool_name, const FSMap *fsmap, fs_cluster_id_t fscid) override; @@ -211,6 +213,7 @@ class MetadataDriver : public RecoveryDriver, public MetadataTool // Implement RecoveryDriver interface int init( librados::Rados &rados, + std::string &metadata_pool_name, const FSMap *fsmap, fs_cluster_id_t fscid) override; @@ -241,6 +244,7 @@ class DataScan : public MDSUtility, public MetadataTool librados::IoCtx data_io; // Remember the data pool ID for use in layouts int64_t data_pool_id; + string metadata_pool_name; uint32_t n; uint32_t m; @@ -316,7 +320,8 @@ class DataScan : public MDSUtility, public MetadataTool int main(const std::vector &args); DataScan() - : driver(NULL), fscid(FS_CLUSTER_ID_NONE), data_pool_id(-1), n(0), m(1), + : driver(NULL), fscid(FS_CLUSTER_ID_NONE), + data_pool_id(-1), metadata_pool_name(""), n(0), m(1), force_pool(false), force_corrupt(false), force_init(false) { diff --git a/src/tools/cephfs/JournalTool.cc b/src/tools/cephfs/JournalTool.cc index f47ff6be5960..0f3b15752501 100644 --- a/src/tools/cephfs/JournalTool.cc +++ b/src/tools/cephfs/JournalTool.cc @@ -56,12 +56,15 @@ void JournalTool::usage() << " --inode=\n" << " --type=<\n" << " --frag=. [--dname=]\n" + << " --alternate-pool=pool-name\n" << " --client=\n" << " : [get|apply|recover_dentries|splice]\n" << " : [summary|list|binary|json] [--path ]\n" << "\n" << "Options:\n" - << " --rank= Journal rank (default 0)\n"; + << " --rank=filesystem:mds-rank Journal rank (required if multiple\n" + << " file systems, default is rank 0 on\n" + << " the only filesystem otherwise.\n"; generic_client_usage(); } @@ -93,6 +96,7 @@ int JournalTool::main(std::vector &argv) r = role_selector.parse(*fsmap, rank_str); if (r != 0) { + derr << "Couldn't determine MDS rank." << dendl; return r; } @@ -131,8 +135,9 @@ int JournalTool::main(std::vector &argv) } dout(4) << "JournalTool: creating IoCtx.." << dendl; - r = rados.ioctx_create(pool_name.c_str(), io); + r = rados.ioctx_create(pool_name.c_str(), input); assert(r == 0); + output.dup(input); // Execution // ========= @@ -209,7 +214,7 @@ int JournalTool::main_journal(std::vector &argv) int JournalTool::main_header(std::vector &argv) { JournalFilter filter; - JournalScanner js(io, rank, filter); + JournalScanner js(input, rank, filter); int r = js.scan(false); if (r < 0) { std::cerr << "Unable to scan journal" << std::endl; @@ -280,7 +285,7 @@ int JournalTool::main_header(std::vector &argv) dout(4) << "Writing object..." 
<< dendl; bufferlist header_bl; ::encode(*(js.header), header_bl); - io.write_full(js.obj_name(0), header_bl); + output.write_full(js.obj_name(0), header_bl); dout(4) << "Write complete." << dendl; std::cout << "Successfully updated header." << std::endl; } else { @@ -343,6 +348,12 @@ int JournalTool::main_event(std::vector &argv) std::string arg_str; if (ceph_argparse_witharg(argv, arg, &arg_str, "--path", (char*)NULL)) { output_path = arg_str; + } else if (ceph_argparse_witharg(argv, arg, &arg_str, "--alternate-pool", + nullptr)) { + dout(1) << "Using alternate pool " << arg_str << dendl; + int r = rados.ioctx_create(arg_str.c_str(), output); + assert(r == 0); + other_pool = true; } else { derr << "Unknown argument: '" << *arg << "'" << dendl; usage(); @@ -352,7 +363,7 @@ int JournalTool::main_event(std::vector &argv) // Execute command // =============== - JournalScanner js(io, rank, filter); + JournalScanner js(input, rank, filter); if (command == "get") { r = js.scan(); if (r) { @@ -428,6 +439,24 @@ int JournalTool::main_event(std::vector &argv) } } } + + // Remove consumed dentries from lost+found. + if (other_pool && !dry_run) { + std::set found; + + for (auto i : consumed_inos) { + char s[20]; + + snprintf(s, sizeof(s), "%llx_head", (unsigned long long) i); + dout(20) << "removing " << s << dendl; + found.insert(std::string(s)); + } + + object_t frag_oid; + frag_oid = InodeStore::get_object_name(CEPH_INO_LOST_AND_FOUND, + frag_t(), ""); + output.omap_rm_keys(frag_oid.name, found); + } } else if (command == "splice") { r = js.scan(); if (r) { @@ -500,7 +529,7 @@ int JournalTool::journal_inspect() int r; JournalFilter filter; - JournalScanner js(io, rank, filter); + JournalScanner js(input, rank, filter); r = js.scan(); if (r) { std::cerr << "Failed to scan journal (" << cpp_strerror(r) << ")" << std::endl; @@ -524,7 +553,7 @@ int JournalTool::journal_inspect() int JournalTool::journal_export(std::string const &path, bool import) { int r = 0; - JournalScanner js(io, rank); + JournalScanner js(input, rank); if (!import) { /* @@ -634,7 +663,7 @@ int JournalTool::scavenge_dentries( // Update fnode in omap header of dirfrag object bool write_fnode = false; bufferlist old_fnode_bl; - r = io.omap_get_header(frag_oid.name, &old_fnode_bl); + r = input.omap_get_header(frag_oid.name, &old_fnode_bl); if (r == -ENOENT) { // Creating dirfrag from scratch dout(4) << "failed to read OMAP header from directory fragment " @@ -664,11 +693,13 @@ int JournalTool::scavenge_dentries( return r; } - if (write_fnode && !dry_run) { + if ((other_pool || write_fnode) && !dry_run) { dout(4) << "writing fnode to omap header" << dendl; bufferlist fnode_bl; lump.fnode.encode(fnode_bl); - r = io.omap_set_header(frag_oid.name, fnode_bl); + if (!other_pool || frag.ino >= MDS_INO_SYSTEM_BASE) { + r = output.omap_set_header(frag_oid.name, fnode_bl); + } if (r != 0) { derr << "Failed to write fnode for frag object " << frag_oid.name << dendl; @@ -707,7 +738,10 @@ int JournalTool::scavenge_dentries( // Perform bulk read of existing dentries std::map read_vals; - r = io.omap_get_vals_by_keys(frag_oid.name, read_keys, &read_vals); + r = input.omap_get_vals_by_keys(frag_oid.name, read_keys, &read_vals); + if (r == -ENOENT && other_pool) { + r = output.omap_get_vals_by_keys(frag_oid.name, read_keys, &read_vals); + } if (r != 0) { derr << "unexpected error reading fragment object " << frag_oid.name << ": " << cpp_strerror(r) << dendl; @@ -769,7 +803,7 @@ int JournalTool::scavenge_dentries( } } - if (write_dentry && !dry_run) { 
+ if ((other_pool || write_dentry) && !dry_run) { dout(4) << "writing I dentry " << key << " into frag " << frag_oid.name << dendl; @@ -831,7 +865,7 @@ int JournalTool::scavenge_dentries( } } - if (write_dentry && !dry_run) { + if ((other_pool || write_dentry) && !dry_run) { dout(4) << "writing L dentry " << key << " into frag " << frag_oid.name << dendl; @@ -850,12 +884,12 @@ int JournalTool::scavenge_dentries( // Write back any new/changed dentries if (!write_vals.empty()) { - r = io.omap_set(frag_oid.name, write_vals); - if (r != 0) { - derr << "error writing dentries to " << frag_oid.name - << ": " << cpp_strerror(r) << dendl; - return r; - } + r = output.omap_set(frag_oid.name, write_vals); + if (r != 0) { + derr << "error writing dentries to " << frag_oid.name + << ": " << cpp_strerror(r) << dendl; + return r; + } } } @@ -876,7 +910,7 @@ int JournalTool::scavenge_dentries( bool write_root_ino = false; bufferlist old_root_ino_bl; - r = io.read(root_oid.name, old_root_ino_bl, (1<<22), 0); + r = input.read(root_oid.name, old_root_ino_bl, (1<<22), 0); if (r == -ENOENT) { dout(4) << "root does not exist, will create" << dendl; write_root_ino = true; @@ -915,7 +949,7 @@ int JournalTool::scavenge_dentries( encode_fullbit_as_inode(fb, false, &new_root_ino_bl); // Write to RADOS - r = io.write_full(root_oid.name, new_root_ino_bl); + r = output.write_full(root_oid.name, new_root_ino_bl); if (r != 0) { derr << "error writing inode object " << root_oid.name << ": " << cpp_strerror(r) << dendl; @@ -942,7 +976,7 @@ int JournalTool::replay_offline(EMetaBlob const &metablob, bool const dry_run) dout(4) << "object id " << root_oid.name << dendl; bufferlist inode_bl; - r = io.read(root_oid.name, inode_bl, (1<<22), 0); + r = input.read(root_oid.name, inode_bl, (1<<22), 0); InodeStore inode; if (r == -ENOENT) { dout(4) << "root does not exist, will create" << dendl; @@ -977,7 +1011,7 @@ int JournalTool::replay_offline(EMetaBlob const &metablob, bool const dry_run) inode.encode(inode_bl, CEPH_FEATURES_SUPPORTED_DEFAULT); if (!dry_run) { - r = io.write_full(root_oid.name, inode_bl); + r = output.write_full(root_oid.name, inode_bl); assert(r == 0); } } @@ -997,7 +1031,7 @@ int JournalTool::replay_offline(EMetaBlob const &metablob, bool const dry_run) // Check for presence of dirfrag object uint64_t psize; time_t pmtime; - r = io.stat(frag_object_id.name, &psize, &pmtime); + r = input.stat(frag_object_id.name, &psize, &pmtime); if (r == -ENOENT) { dout(4) << "Frag object " << frag_object_id.name << " did not exist, will create" << dendl; } else if (r != 0) { @@ -1011,7 +1045,7 @@ int JournalTool::replay_offline(EMetaBlob const &metablob, bool const dry_run) bufferlist fnode_bl; lump.fnode.encode(fnode_bl); if (!dry_run) { - r = io.omap_set_header(frag_object_id.name, fnode_bl); + r = output.omap_set_header(frag_object_id.name, fnode_bl); if (r != 0) { derr << "Failed to write fnode for frag object " << frag_object_id.name << dendl; return r; @@ -1032,7 +1066,7 @@ int JournalTool::replay_offline(EMetaBlob const &metablob, bool const dry_run) std::set keys; keys.insert(key); std::map vals; - r = io.omap_get_vals_by_keys(frag_object_id.name, keys, &vals); + r = input.omap_get_vals_by_keys(frag_object_id.name, keys, &vals); assert (r == 0); // I assume success because I checked object existed and absence of // dentry gives me empty map instead of failure // FIXME handle failures so we can replay other events @@ -1061,7 +1095,7 @@ int JournalTool::replay_offline(EMetaBlob const &metablob, bool const dry_run) 
vals[key] = dentry_bl; if (!dry_run) { - r = io.omap_set(frag_object_id.name, vals); + r = output.omap_set(frag_object_id.name, vals); assert(r == 0); // FIXME handle failures } } @@ -1082,7 +1116,7 @@ int JournalTool::replay_offline(EMetaBlob const &metablob, bool const dry_run) std::set keys; keys.insert(key); if (!dry_run) { - r = io.omap_rm_keys(frag_object_id.name, keys); + r = output.omap_rm_keys(frag_object_id.name, keys); assert(r == 0); } } @@ -1151,7 +1185,7 @@ int JournalTool::erase_region(JournalScanner const &js, uint64_t const pos, uint uint32_t offset_in_obj = write_offset % object_size; uint32_t write_len = min(log_data.length(), object_size - offset_in_obj); - r = io.write(oid, log_data, write_len, offset_in_obj); + r = output.write(oid, log_data, write_len, offset_in_obj); if (r < 0) { return r; } else { @@ -1237,7 +1271,7 @@ int JournalTool::consume_inos(const std::set &inos) // Read object bufferlist inotable_bl; - int read_r = io.read(inotable_oid.name, inotable_bl, (1<<22), 0); + int read_r = input.read(inotable_oid.name, inotable_bl, (1<<22), 0); if (read_r < 0) { // Things are really bad if we can't read inotable. Beyond our powers. derr << "unable to read inotable '" << inotable_oid.name << "': " @@ -1273,7 +1307,7 @@ int JournalTool::consume_inos(const std::set &inos) bufferlist inotable_new_bl; ::encode(inotable_ver, inotable_new_bl); ino_table.encode_state(inotable_new_bl); - int write_r = io.write_full(inotable_oid.name, inotable_new_bl); + int write_r = output.write_full(inotable_oid.name, inotable_new_bl); if (write_r != 0) { derr << "error writing modified inotable " << inotable_oid.name << ": " << cpp_strerror(write_r) << dendl; diff --git a/src/tools/cephfs/JournalTool.h b/src/tools/cephfs/JournalTool.h index f8f90606fd09..f610255c2347 100644 --- a/src/tools/cephfs/JournalTool.h +++ b/src/tools/cephfs/JournalTool.h @@ -56,9 +56,13 @@ class JournalTool : public MDSUtility // I/O handles librados::Rados rados; - librados::IoCtx io; + librados::IoCtx input; + librados::IoCtx output; + + bool other_pool; // Metadata backing store manipulation + int read_lost_found(std::set &lost); int scavenge_dentries( EMetaBlob const &metablob, bool const dry_run, @@ -78,7 +82,7 @@ class JournalTool : public MDSUtility public: void usage(); JournalTool() : - rank(0) {} + rank(0), other_pool(false) {} int main(std::vector &argv); };
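
Reviewer note: a minimal sketch of how the input/output IoCtx split introduced
in JournalTool above is meant to be used, reading metadata objects from the
damaged pool while writing recovered objects into the alternate pool. The
librados calls are real, but the pool names ("metadata", "recovery"), the
object name, and the error handling are illustrative only and not part of
this patch:

    #include <rados/librados.hpp>

    int main()
    {
      librados::Rados cluster;
      if (cluster.init("admin") < 0 ||
          cluster.conf_read_file(nullptr) < 0 ||
          cluster.connect() < 0) {
        return 1;
      }

      // One IoCtx per pool, mirroring JournalTool's new input/output members:
      // read from the damaged metadata pool, write into the recovery pool.
      librados::IoCtx input, output;
      cluster.ioctx_create("metadata", input);   // damaged pool (read side)
      cluster.ioctx_create("recovery", output);  // alternate pool (write side)

      // Example: copy one dirfrag's omap header across pools, the same
      // pattern scavenge_dentries() follows when --alternate-pool is given.
      // The object name below is a made-up <inode>.<frag> name.
      librados::bufferlist header;
      if (input.omap_get_header("10000000000.00000000", &header) == 0) {
        output.omap_set_header("10000000000.00000000", header);
      }

      cluster.shutdown();
      return 0;
    }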