From a1842eb262a16215b82d22e78c7978bd35874512 Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Tue, 30 May 2017 20:10:55 +0800 Subject: [PATCH] mds: fix race between cross-authorty rename and mds failure If auth mds of rename srcdn fails, bystander mds may need to rollback corresponding slave rename. If corresponding slave rename is still work-in-progress, we should interrupt the slave rename, make it do nothing. Signed-off-by: "Yan, Zheng" --- src/mds/MDCache.cc | 7 +++++++ src/mds/MDSDaemon.h | 2 +- src/mds/Server.cc | 18 ++++++++++++++++-- src/messages/MMDSSlaveRequest.h | 8 +++++++- 4 files changed, 31 insertions(+), 4 deletions(-) diff --git a/src/mds/MDCache.cc b/src/mds/MDCache.cc index fd030e457aeca..e3ada0b1672cf 100644 --- a/src/mds/MDCache.cc +++ b/src/mds/MDCache.cc @@ -2938,6 +2938,13 @@ void MDCache::handle_mds_failure(mds_rank_t who) dout(10) << " slave request " << *mdr << " uncommitted, will resolve shortly" << dendl; add_ambiguous_slave_update(p->first, mdr->slave_to_mds); } + } else if (mdr->slave_request) { + MMDSSlaveRequest *slave_req = mdr->slave_request; + // FIXME: Slave rename request can arrive after we notice mds failure. + // This can cause mds to crash (does not affect integrity of FS). + if (slave_req->get_op() == MMDSSlaveRequest::OP_RENAMEPREP && + slave_req->srcdn_auth == who) + slave_req->mark_interrupted(); } // failed node is slave? diff --git a/src/mds/MDSDaemon.h b/src/mds/MDSDaemon.h index 2abcd87fe0640..1c6a7d791a90d 100644 --- a/src/mds/MDSDaemon.h +++ b/src/mds/MDSDaemon.h @@ -40,7 +40,7 @@ #include "Beacon.h" -#define CEPH_MDS_PROTOCOL 29 /* cluster internal */ +#define CEPH_MDS_PROTOCOL 30 /* cluster internal */ class MonClient; diff --git a/src/mds/Server.cc b/src/mds/Server.cc index 43ea584be1b1d..c330e170f2189 100644 --- a/src/mds/Server.cc +++ b/src/mds/Server.cc @@ -6920,6 +6920,8 @@ bool Server::_rename_prepare_witness(MDRequestRef& mdr, mds_rank_t who, setdestdnpath.push_dentry(dn->name); if (straydn) mdcache->replicate_stray(straydn, who, req->stray); + + req->srcdn_auth = mdr->more()->srcdn_auth_mds; // srcdn auth will verify our current witness list is sufficient req->witnesses = witnesse; @@ -7470,7 +7472,17 @@ void Server::handle_slave_rename_prep(MDRequestRef& mdr) << " " << mdr->slave_request->srcdnpath << " to " << mdr->slave_request->destdnpath << dendl; - + + if (mdr->slave_request->is_interrupted()) { + dout(10) << " slave request interrupted, sending noop reply" << dendl; + MMDSSlaveRequest *reply= new MMDSSlaveRequest(mdr->reqid, mdr->attempt, MMDSSlaveRequest::OP_RENAMEPREPACK); + reply->mark_interrupted(); + mds->send_message_mds(reply, mdr->slave_to_mds); + mdr->slave_request->put(); + mdr->slave_request = 0; + return; + } + // discover destdn filepath destpath(mdr->slave_request->destdnpath); dout(10) << " dest " << destpath << dendl; @@ -8233,7 +8245,9 @@ void Server::handle_slave_rename_prep_ack(MDRequestRef& mdr, MMDSSlaveRequest *a // witnessed? or add extra witnesses? assert(mdr->more()->witnessed.count(from) == 0); - if (ack->witnesses.empty()) { + if (ack->is_interrupted()) { + dout(10) << " slave request interrupted, noop" << dendl; + } else if (ack->witnesses.empty()) { mdr->more()->witnessed.insert(from); if (!ack->is_not_journaled()) mdr->more()->has_journaled_slaves = true; diff --git a/src/messages/MMDSSlaveRequest.h b/src/messages/MMDSSlaveRequest.h index 5512c5a0721e6..9e7510c9a6df8 100644 --- a/src/messages/MMDSSlaveRequest.h +++ b/src/messages/MMDSSlaveRequest.h @@ -101,6 +101,7 @@ class MMDSSlaveRequest : public Message { static const unsigned FLAG_NOTJOURNALED = 1<<2; static const unsigned FLAG_EROFS = 1<<3; static const unsigned FLAG_ABORT = 1<<4; + static const unsigned FLAG_INTERRUPTED = 1<<5; // for locking __u16 lock_type; // lock object type @@ -117,6 +118,7 @@ class MMDSSlaveRequest : public Message { bufferlist inode_export; version_t inode_export_v; bufferlist srci_replica; + mds_rank_t srcdn_auth; utime_t op_stamp; bufferlist stray; // stray dir + dentry @@ -142,6 +144,8 @@ public: bool is_error_rofs() { return (flags & FLAG_EROFS); } bool is_abort() { return (flags & FLAG_ABORT); } void mark_abort() { flags |= FLAG_ABORT; } + bool is_interrupted() { return (flags & FLAG_INTERRUPTED); } + void mark_interrupted() { flags |= FLAG_INTERRUPTED; } void set_lock_type(int t) { lock_type = t; } @@ -151,7 +155,7 @@ public: MMDSSlaveRequest(metareqid_t ri, __u32 att, int o) : Message(MSG_MDS_SLAVE_REQUEST), reqid(ri), attempt(att), op(o), flags(0), lock_type(0), - inode_export_v(0) { } + inode_export_v(0), srcdn_auth(MDS_RANK_NONE) { } private: ~MMDSSlaveRequest() override {} @@ -170,6 +174,7 @@ public: ::encode(op_stamp, payload); ::encode(inode_export, payload); ::encode(inode_export_v, payload); + ::encode(srcdn_auth, payload); ::encode(srci_replica, payload); ::encode(stray, payload); } @@ -188,6 +193,7 @@ public: ::decode(op_stamp, p); ::decode(inode_export, p); ::decode(inode_export_v, p); + ::decode(srcdn_auth, p); ::decode(srci_replica, p); ::decode(stray, p); } -- 2.47.3