files in the filesystem and read their layouts, so the MDS must be
up and running.
+
+Using an alternate metadata pool for recovery
+---------------------------------------------
+
+.. warning::
+
+ There has not been extensive testing of this procedure. It should be
+ undertaken with great care.
+
+If an existing filesystem is damaged and inoperative, it is possible to create
+a fresh metadata pool and attempt to reconstruct the filesystem metadata
+into this new pool, leaving the old metadata in place. This could be used to
+make a safer attempt at recovery since the existing metadata pool would not be
+overwritten.
+
+.. caution::
+
+ During this process, multiple metadata pools will contain data referring to
+ the same data pool. Extreme caution must be exercised to avoid changing the
+ data pool contents while this is the case. Once recovery is complete, the
+ damaged metadata pool should be deleted.
+
+To begin this process, first create the fresh metadata pool and initialize
+it with empty file system data structures:
+
+::
+
+ ceph fs flag set enable_multiple true --yes-i-really-mean-it
+ ceph osd pool create recovery <pg-num> replicated <crush-ruleset-name>
+ ceph fs new recovery-fs recovery <data pool> --allow-dangerous-metadata-overlay
+ cephfs-data-scan init --force-init --filesystem recovery-fs --alternate-pool recovery
+ ceph fs reset recovery-fs --yes-i-really-mean-it
+ cephfs-table-tool recovery-fs:all reset session
+ cephfs-table-tool recovery-fs:all reset snap
+ cephfs-table-tool recovery-fs:all reset inode
+
+Next, run the recovery toolset using the ``--alternate-pool`` argument to output
+results to the alternate pool:
+
+::
+
+ cephfs-data-scan scan_extents --alternate-pool recovery --filesystem <original filesystem name>
+ cephfs-data-scan scan_inodes --alternate-pool recovery --filesystem <original filesystem name> --force-corrupt --force-init <original data pool name>
+
+If the damaged filesystem contains dirty journal data, it may be recovered next
+with:
+
+::
+
+ cephfs-journal-tool --rank=<original filesystem name>:0 event recover_dentries list --alternate-pool recovery
+ cephfs-journal-tool --rank recovery-fs:0 journal reset --force
+
+After recovery, some recovered directories will have incorrect link counts.
+Ensure the parameter mds_debug_scatterstat is set to false (the default) to
+prevent the MDS from checking the link counts, then run a forward scrub to
+repair them. Ensure you have an MDS running and issue:
+
+::
+
+ ceph daemon mds.a scrub_path / recursive repair
}
fscid = fs->fscid;
return true;
+ } else if (arg == std::string("--alternate-pool")) {
+ metadata_pool_name = val;
+ return true;
} else {
return false;
}
std::string const &command = args[0];
std::string data_pool_name;
- std::string metadata_pool_name;
std::string pg_files_path;
std::set<pg_t> pg_files_pgs;
return r;
}
- r = driver->init(rados, fsmap, fscid);
+ r = driver->init(rados, metadata_pool_name, fsmap, fscid);
if (r < 0) {
return r;
}
int64_t const metadata_pool_id = fs->mds_map.get_metadata_pool();
dout(4) << "resolving metadata pool " << metadata_pool_id << dendl;
- std::string metadata_pool_name;
int r = rados.pool_reverse_lookup(metadata_pool_id, &metadata_pool_name);
if (r < 0) {
std::cerr << "Pool " << metadata_pool_id
uint64_t size;
time_t mtime;
int r = data_io.stat(oid, &size, &mtime);
+ dout(10) << "handling object " << obj_name_ino
+ << "." << obj_name_offset << dendl;
if (r != 0) {
dout(4) << "Cannot stat '" << oid << "': skipping" << dendl;
return r;
{
int r = 0;
+ dout(10) << "handling object "
+ << std::hex << obj_name_ino << "." << obj_name_offset << std::dec
+ << dendl;
+
AccumulateResult accum_res;
inode_backtrace_t backtrace;
file_layout_t loaded_layout = file_layout_t::get_default();
int MetadataDriver::init(
- librados::Rados &rados, const FSMap *fsmap, fs_cluster_id_t fscid)
+ librados::Rados &rados, std::string &metadata_pool_name, const FSMap *fsmap,
+ fs_cluster_id_t fscid)
{
- auto fs = fsmap->get_filesystem(fscid);
- assert(fs != nullptr);
- int64_t const metadata_pool_id = fs->mds_map.get_metadata_pool();
+ if (metadata_pool_name.empty()) {
+ auto fs = fsmap->get_filesystem(fscid);
+ assert(fs != nullptr);
+ int64_t const metadata_pool_id = fs->mds_map.get_metadata_pool();
- dout(4) << "resolving metadata pool " << metadata_pool_id << dendl;
- std::string metadata_pool_name;
- int r = rados.pool_reverse_lookup(metadata_pool_id, &metadata_pool_name);
- if (r < 0) {
- derr << "Pool " << metadata_pool_id
- << " identified in MDS map not found in RADOS!" << dendl;
- return r;
+ dout(4) << "resolving metadata pool " << metadata_pool_id << dendl;
+ int r = rados.pool_reverse_lookup(metadata_pool_id, &metadata_pool_name);
+ if (r < 0) {
+ derr << "Pool " << metadata_pool_id
+ << " identified in MDS map not found in RADOS!" << dendl;
+ return r;
+ }
+ dout(4) << "found metadata pool '" << metadata_pool_name << "'" << dendl;
+ } else {
+ dout(4) << "forcing metadata pool '" << metadata_pool_name << "'" << dendl;
}
- dout(4) << "found metadata pool '" << metadata_pool_name << "'" << dendl;
return rados.ioctx_create(metadata_pool_name.c_str(), metadata_io);
}
int LocalFileDriver::init(
- librados::Rados &rados, const FSMap *fsmap, fs_cluster_id_t fscid)
+ librados::Rados &rados, std::string &metadata_pool_name, const FSMap *fsmap,
+ fs_cluster_id_t fscid)
{
return 0;
}
<< " --inode=<integer>\n"
<< " --type=<UPDATE|OPEN|SESSION...><\n"
<< " --frag=<ino>.<frag> [--dname=<dentry string>]\n"
+ << " --alternate-pool=pool-name\n"
<< " --client=<session id integer>\n"
<< " <effect>: [get|apply|recover_dentries|splice]\n"
<< " <output>: [summary|list|binary|json] [--path <path>]\n"
<< "\n"
<< "Options:\n"
- << " --rank=<str> Journal rank (default 0)\n";
+ << " --rank=filesystem:mds-rank Journal rank (required if multiple\n"
+ << " file systems, default is rank 0 on\n"
+ << " the only filesystem otherwise.\n";
generic_client_usage();
}
r = role_selector.parse(*fsmap, rank_str);
if (r != 0) {
+ derr << "Couldn't determine MDS rank." << dendl;
return r;
}
}
dout(4) << "JournalTool: creating IoCtx.." << dendl;
- r = rados.ioctx_create(pool_name.c_str(), io);
+ r = rados.ioctx_create(pool_name.c_str(), input);
assert(r == 0);
+ output.dup(input);
// Execution
// =========
int JournalTool::main_header(std::vector<const char*> &argv)
{
JournalFilter filter;
- JournalScanner js(io, rank, filter);
+ JournalScanner js(input, rank, filter);
int r = js.scan(false);
if (r < 0) {
std::cerr << "Unable to scan journal" << std::endl;
dout(4) << "Writing object..." << dendl;
bufferlist header_bl;
::encode(*(js.header), header_bl);
- io.write_full(js.obj_name(0), header_bl);
+ output.write_full(js.obj_name(0), header_bl);
dout(4) << "Write complete." << dendl;
std::cout << "Successfully updated header." << std::endl;
} else {
std::string arg_str;
if (ceph_argparse_witharg(argv, arg, &arg_str, "--path", (char*)NULL)) {
output_path = arg_str;
+ } else if (ceph_argparse_witharg(argv, arg, &arg_str, "--alternate-pool",
+ nullptr)) {
+ dout(1) << "Using alternate pool " << arg_str << dendl;
+ int r = rados.ioctx_create(arg_str.c_str(), output);
+ assert(r == 0);
+ other_pool = true;
} else {
derr << "Unknown argument: '" << *arg << "'" << dendl;
usage();
// Execute command
// ===============
- JournalScanner js(io, rank, filter);
+ JournalScanner js(input, rank, filter);
if (command == "get") {
r = js.scan();
if (r) {
}
}
}
+
+ // Remove consumed dentries from lost+found.
+ if (other_pool && !dry_run) {
+ std::set<std::string> found;
+
+ for (auto i : consumed_inos) {
+ char s[20];
+
+ snprintf(s, sizeof(s), "%llx_head", (unsigned long long) i);
+ dout(20) << "removing " << s << dendl;
+ found.insert(std::string(s));
+ }
+
+ object_t frag_oid;
+ frag_oid = InodeStore::get_object_name(CEPH_INO_LOST_AND_FOUND,
+ frag_t(), "");
+ output.omap_rm_keys(frag_oid.name, found);
+ }
} else if (command == "splice") {
r = js.scan();
if (r) {
int r;
JournalFilter filter;
- JournalScanner js(io, rank, filter);
+ JournalScanner js(input, rank, filter);
r = js.scan();
if (r) {
std::cerr << "Failed to scan journal (" << cpp_strerror(r) << ")" << std::endl;
int JournalTool::journal_export(std::string const &path, bool import)
{
int r = 0;
- JournalScanner js(io, rank);
+ JournalScanner js(input, rank);
if (!import) {
/*
// Update fnode in omap header of dirfrag object
bool write_fnode = false;
bufferlist old_fnode_bl;
- r = io.omap_get_header(frag_oid.name, &old_fnode_bl);
+ r = input.omap_get_header(frag_oid.name, &old_fnode_bl);
if (r == -ENOENT) {
// Creating dirfrag from scratch
dout(4) << "failed to read OMAP header from directory fragment "
return r;
}
- if (write_fnode && !dry_run) {
+ if ((other_pool || write_fnode) && !dry_run) {
dout(4) << "writing fnode to omap header" << dendl;
bufferlist fnode_bl;
lump.fnode.encode(fnode_bl);
- r = io.omap_set_header(frag_oid.name, fnode_bl);
+ if (!other_pool || frag.ino >= MDS_INO_SYSTEM_BASE) {
+ r = output.omap_set_header(frag_oid.name, fnode_bl);
+ }
if (r != 0) {
derr << "Failed to write fnode for frag object "
<< frag_oid.name << dendl;
// Perform bulk read of existing dentries
std::map<std::string, bufferlist> read_vals;
- r = io.omap_get_vals_by_keys(frag_oid.name, read_keys, &read_vals);
+ r = input.omap_get_vals_by_keys(frag_oid.name, read_keys, &read_vals);
+ if (r == -ENOENT && other_pool) {
+ r = output.omap_get_vals_by_keys(frag_oid.name, read_keys, &read_vals);
+ }
if (r != 0) {
derr << "unexpected error reading fragment object "
<< frag_oid.name << ": " << cpp_strerror(r) << dendl;
}
}
- if (write_dentry && !dry_run) {
+ if ((other_pool || write_dentry) && !dry_run) {
dout(4) << "writing I dentry " << key << " into frag "
<< frag_oid.name << dendl;
}
}
- if (write_dentry && !dry_run) {
+ if ((other_pool || write_dentry) && !dry_run) {
dout(4) << "writing L dentry " << key << " into frag "
<< frag_oid.name << dendl;
// Write back any new/changed dentries
if (!write_vals.empty()) {
- r = io.omap_set(frag_oid.name, write_vals);
- if (r != 0) {
- derr << "error writing dentries to " << frag_oid.name
- << ": " << cpp_strerror(r) << dendl;
- return r;
- }
+ r = output.omap_set(frag_oid.name, write_vals);
+ if (r != 0) {
+ derr << "error writing dentries to " << frag_oid.name
+ << ": " << cpp_strerror(r) << dendl;
+ return r;
+ }
}
}
bool write_root_ino = false;
bufferlist old_root_ino_bl;
- r = io.read(root_oid.name, old_root_ino_bl, (1<<22), 0);
+ r = input.read(root_oid.name, old_root_ino_bl, (1<<22), 0);
if (r == -ENOENT) {
dout(4) << "root does not exist, will create" << dendl;
write_root_ino = true;
encode_fullbit_as_inode(fb, false, &new_root_ino_bl);
// Write to RADOS
- r = io.write_full(root_oid.name, new_root_ino_bl);
+ r = output.write_full(root_oid.name, new_root_ino_bl);
if (r != 0) {
derr << "error writing inode object " << root_oid.name
<< ": " << cpp_strerror(r) << dendl;
dout(4) << "object id " << root_oid.name << dendl;
bufferlist inode_bl;
- r = io.read(root_oid.name, inode_bl, (1<<22), 0);
+ r = input.read(root_oid.name, inode_bl, (1<<22), 0);
InodeStore inode;
if (r == -ENOENT) {
dout(4) << "root does not exist, will create" << dendl;
inode.encode(inode_bl, CEPH_FEATURES_SUPPORTED_DEFAULT);
if (!dry_run) {
- r = io.write_full(root_oid.name, inode_bl);
+ r = output.write_full(root_oid.name, inode_bl);
assert(r == 0);
}
}
// Check for presence of dirfrag object
uint64_t psize;
time_t pmtime;
- r = io.stat(frag_object_id.name, &psize, &pmtime);
+ r = input.stat(frag_object_id.name, &psize, &pmtime);
if (r == -ENOENT) {
dout(4) << "Frag object " << frag_object_id.name << " did not exist, will create" << dendl;
} else if (r != 0) {
bufferlist fnode_bl;
lump.fnode.encode(fnode_bl);
if (!dry_run) {
- r = io.omap_set_header(frag_object_id.name, fnode_bl);
+ r = output.omap_set_header(frag_object_id.name, fnode_bl);
if (r != 0) {
derr << "Failed to write fnode for frag object " << frag_object_id.name << dendl;
return r;
std::set<std::string> keys;
keys.insert(key);
std::map<std::string, bufferlist> vals;
- r = io.omap_get_vals_by_keys(frag_object_id.name, keys, &vals);
+ r = input.omap_get_vals_by_keys(frag_object_id.name, keys, &vals);
assert (r == 0); // I assume success because I checked object existed and absence of
// dentry gives me empty map instead of failure
// FIXME handle failures so we can replay other events
vals[key] = dentry_bl;
if (!dry_run) {
- r = io.omap_set(frag_object_id.name, vals);
+ r = output.omap_set(frag_object_id.name, vals);
assert(r == 0); // FIXME handle failures
}
}
std::set<std::string> keys;
keys.insert(key);
if (!dry_run) {
- r = io.omap_rm_keys(frag_object_id.name, keys);
+ r = output.omap_rm_keys(frag_object_id.name, keys);
assert(r == 0);
}
}
uint32_t offset_in_obj = write_offset % object_size;
uint32_t write_len = min(log_data.length(), object_size - offset_in_obj);
- r = io.write(oid, log_data, write_len, offset_in_obj);
+ r = output.write(oid, log_data, write_len, offset_in_obj);
if (r < 0) {
return r;
} else {
// Read object
bufferlist inotable_bl;
- int read_r = io.read(inotable_oid.name, inotable_bl, (1<<22), 0);
+ int read_r = input.read(inotable_oid.name, inotable_bl, (1<<22), 0);
if (read_r < 0) {
// Things are really bad if we can't read inotable. Beyond our powers.
derr << "unable to read inotable '" << inotable_oid.name << "': "
bufferlist inotable_new_bl;
::encode(inotable_ver, inotable_new_bl);
ino_table.encode_state(inotable_new_bl);
- int write_r = io.write_full(inotable_oid.name, inotable_new_bl);
+ int write_r = output.write_full(inotable_oid.name, inotable_new_bl);
if (write_r != 0) {
derr << "error writing modified inotable " << inotable_oid.name
<< ": " << cpp_strerror(write_r) << dendl;