From 3783653217b62ed432dfdfaef26ea23c5b117fd5 Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Thu, 8 May 2014 13:55:25 +0800 Subject: [PATCH] mds: include all of directory inode's replicas in rmdir witnesses If an MDS crashes after journaling an rmdir operation but before sending the MDentryUnlink messages, the surviving MDSes may have incorrect linkage for the removed directory. Later, when the crashed MDS recovers, the incorrect linkage can cause the surviving MDSes to crash. The fix is to include all of the directory inode's replicas in the rmdir witness list. When receiving an rmdir slave request, an MDS that has no auth subtree in the directory only needs to update its cache and send a reply (it doesn't need to journal the slave request). Signed-off-by: Yan, Zheng --- src/mds/Mutation.h | 3 ++ src/mds/Server.cc | 72 +++++++++++++++++++++++++++++++++++----------- 2 files changed, 58 insertions(+), 17 deletions(-) diff --git a/src/mds/Mutation.h b/src/mds/Mutation.h index 2f192668104b0..e07089ddbc225 100644 --- a/src/mds/Mutation.h +++ b/src/mds/Mutation.h @@ -208,6 +208,8 @@ struct MDRequestImpl : public MutationImpl, public TrackedOp { // for rename/link/unlink set witnessed; // nodes who have journaled a RenamePrepare map pvmap; + + bool slave_update_journaled; // for rename set extra_witnesses; // replica list from srcdn auth (rename) @@ -242,6 +244,7 @@ struct MDRequestImpl : public MutationImpl, public TrackedOp { dirfrag_t fragment_base; More() : + slave_update_journaled(false), srcdn_auth_mds(-1), inode_import_v(0), rename_inode(0), is_freeze_authpin(false), is_ambiguous_auth(false), is_remote_frozen_authpin(false), is_inode_exporter(false), diff --git a/src/mds/Server.cc b/src/mds/Server.cc index b0f0a13d1631d..2cc9726052856 100644 --- a/src/mds/Server.cc +++ b/src/mds/Server.cc @@ -4869,14 +4869,7 @@ void Server::handle_client_unlink(MDRequestRef& mdr) if (in->is_dir() && in->has_subtree_root_dirfrag()) { // subtree root auths need to be witnesses set witnesses; - list ls; - 
in->get_subtree_dirfrags(ls); - for (list::iterator p = ls.begin(); p != ls.end(); ++p) { - CDir *dir = *p; - int auth = dir->authority().first; - witnesses.insert(auth); - dout(10) << " need mds." << auth << " to witness for dirfrag " << *dir << dendl; - } + in->list_replicas(witnesses); dout(10) << " witnesses " << witnesses << ", have " << mdr->more()->witnessed << dendl; for (set::iterator p = witnesses.begin(); @@ -5126,6 +5119,35 @@ void Server::handle_slave_rmdir_prep(MDRequestRef& mdr) ::encode(rollback, mdr->more()->rollback_bl); dout(20) << " rollback is " << mdr->more()->rollback_bl.length() << " bytes" << dendl; + // set up commit waiter + mdr->more()->slave_commit = new C_MDS_SlaveRmdirCommit(this, mdr); + + if (!in->has_subtree_root_dirfrag(mds->get_nodeid())) { + dout(10) << " no auth subtree in " << *in << ", skipping journal" << dendl; + dn->get_dir()->unlink_inode(dn); + straydn->get_dir()->link_primary_inode(straydn, in); + + assert(straydn->first >= in->first); + in->first = straydn->first; + + mdcache->adjust_subtree_after_rename(in, dn->get_dir(), false); + + MMDSSlaveRequest *reply = new MMDSSlaveRequest(mdr->reqid, mdr->attempt, + MMDSSlaveRequest::OP_RMDIRPREPACK); + mds->send_message_mds(reply, mdr->slave_to_mds); + + // send caps to auth (if we're not already) + if (in->is_any_caps() && !in->state_test(CInode::STATE_EXPORTINGCAPS)) + mdcache->migrator->export_caps(in); + + mdcache->touch_dentry_bottom(straydn); // move stray to end of lru + + mdr->slave_request->put(); + mdr->slave_request = 0; + mdr->straydn = 0; + return; + } + straydn->push_projected_linkage(in); dn->push_projected_linkage(); @@ -5143,9 +5165,7 @@ void Server::handle_slave_rmdir_prep(MDRequestRef& mdr) mds->mdcache->project_subtree_rename(in, dn->get_dir(), straydn->get_dir()); - // set up commit waiter - mdr->more()->slave_commit = new C_MDS_SlaveRmdirCommit(this, mdr); - + mdr->more()->slave_update_journaled = true; submit_mdlog_entry(le, new 
C_MDS_SlaveRmdirPrep(this, mdr, dn, straydn), mdr, __func__); mdlog->flush(); @@ -5203,14 +5223,19 @@ void Server::_commit_slave_rmdir(MDRequestRef& mdr, int r) dout(10) << "_commit_slave_rmdir " << *mdr << " r=" << r << dendl; if (r == 0) { - // write a commit to the journal - ESlaveUpdate *le = new ESlaveUpdate(mdlog, "slave_rmdir_commit", mdr->reqid, mdr->slave_to_mds, - ESlaveUpdate::OP_COMMIT, ESlaveUpdate::RMDIR); - mdlog->start_entry(le); mdr->cleanup(); - submit_mdlog_entry(le, new C_MDS_CommittedSlave(this, mdr), mdr, __func__); - mdlog->flush(); + if (mdr->more()->slave_update_journaled) { + // write a commit to the journal + ESlaveUpdate *le = new ESlaveUpdate(mdlog, "slave_rmdir_commit", mdr->reqid, + mdr->slave_to_mds, ESlaveUpdate::OP_COMMIT, + ESlaveUpdate::RMDIR); + mdlog->start_entry(le); + submit_mdlog_entry(le, new C_MDS_CommittedSlave(this, mdr), mdr, __func__); + mdlog->flush(); + } else { + _committed_slave(mdr); + } } else { // abort do_rmdir_rollback(mdr->more()->rollback_bl, mdr->slave_to_mds, mdr); @@ -5259,6 +5284,19 @@ void Server::do_rmdir_rollback(bufferlist &rbl, int master, MDRequestRef& mdr) dout(10) << " straydn " << *dn << dendl; CInode *in = straydn->get_linkage()->get_inode(); + if (mdr && !mdr->more()->slave_update_journaled) { + assert(!in->has_subtree_root_dirfrag(mds->get_nodeid())); + + straydn->get_dir()->unlink_inode(straydn); + dn->get_dir()->link_primary_inode(dn, in); + + mdcache->adjust_subtree_after_rename(in, straydn->get_dir(), false); + + mds->mdcache->request_finish(mdr); + mds->mdcache->finish_rollback(rollback.reqid); + return; + } + dn->push_projected_linkage(in); straydn->push_projected_linkage(); -- 2.39.5