From: Yan, Zheng Date: Tue, 30 May 2017 12:10:55 +0000 (+0800) Subject: mds: fix race between cross-authorty rename and mds failure X-Git-Tag: v12.1.0~128^2 X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=refs%2Fpull%2F15125%2Fhead;p=ceph.git mds: fix race between cross-authorty rename and mds failure If auth mds of rename srcdn fails, bystander mds may need to rollback corresponding slave rename. If corresponding slave rename is still work-in-progress, we should interrupt the slave rename, make it do nothing. Signed-off-by: "Yan, Zheng" --- diff --git a/src/mds/MDCache.cc b/src/mds/MDCache.cc index fd030e457aeca..e3ada0b1672cf 100644 --- a/src/mds/MDCache.cc +++ b/src/mds/MDCache.cc @@ -2938,6 +2938,13 @@ void MDCache::handle_mds_failure(mds_rank_t who) dout(10) << " slave request " << *mdr << " uncommitted, will resolve shortly" << dendl; add_ambiguous_slave_update(p->first, mdr->slave_to_mds); } + } else if (mdr->slave_request) { + MMDSSlaveRequest *slave_req = mdr->slave_request; + // FIXME: Slave rename request can arrive after we notice mds failure. + // This can cause mds to crash (does not affect integrity of FS). + if (slave_req->get_op() == MMDSSlaveRequest::OP_RENAMEPREP && + slave_req->srcdn_auth == who) + slave_req->mark_interrupted(); } // failed node is slave? diff --git a/src/mds/MDSDaemon.h b/src/mds/MDSDaemon.h index 2abcd87fe0640..1c6a7d791a90d 100644 --- a/src/mds/MDSDaemon.h +++ b/src/mds/MDSDaemon.h @@ -40,7 +40,7 @@ #include "Beacon.h" -#define CEPH_MDS_PROTOCOL 29 /* cluster internal */ +#define CEPH_MDS_PROTOCOL 30 /* cluster internal */ class MonClient; diff --git a/src/mds/Server.cc b/src/mds/Server.cc index 43ea584be1b1d..c330e170f2189 100644 --- a/src/mds/Server.cc +++ b/src/mds/Server.cc @@ -6920,6 +6920,8 @@ bool Server::_rename_prepare_witness(MDRequestRef& mdr, mds_rank_t who, setdestdnpath.push_dentry(dn->name); if (straydn) mdcache->replicate_stray(straydn, who, req->stray); + + req->srcdn_auth = mdr->more()->srcdn_auth_mds; // srcdn auth will verify our current witness list is sufficient req->witnesses = witnesse; @@ -7470,7 +7472,17 @@ void Server::handle_slave_rename_prep(MDRequestRef& mdr) << " " << mdr->slave_request->srcdnpath << " to " << mdr->slave_request->destdnpath << dendl; - + + if (mdr->slave_request->is_interrupted()) { + dout(10) << " slave request interrupted, sending noop reply" << dendl; + MMDSSlaveRequest *reply= new MMDSSlaveRequest(mdr->reqid, mdr->attempt, MMDSSlaveRequest::OP_RENAMEPREPACK); + reply->mark_interrupted(); + mds->send_message_mds(reply, mdr->slave_to_mds); + mdr->slave_request->put(); + mdr->slave_request = 0; + return; + } + // discover destdn filepath destpath(mdr->slave_request->destdnpath); dout(10) << " dest " << destpath << dendl; @@ -8233,7 +8245,9 @@ void Server::handle_slave_rename_prep_ack(MDRequestRef& mdr, MMDSSlaveRequest *a // witnessed? or add extra witnesses? assert(mdr->more()->witnessed.count(from) == 0); - if (ack->witnesses.empty()) { + if (ack->is_interrupted()) { + dout(10) << " slave request interrupted, noop" << dendl; + } else if (ack->witnesses.empty()) { mdr->more()->witnessed.insert(from); if (!ack->is_not_journaled()) mdr->more()->has_journaled_slaves = true; diff --git a/src/messages/MMDSSlaveRequest.h b/src/messages/MMDSSlaveRequest.h index 5512c5a0721e6..9e7510c9a6df8 100644 --- a/src/messages/MMDSSlaveRequest.h +++ b/src/messages/MMDSSlaveRequest.h @@ -101,6 +101,7 @@ class MMDSSlaveRequest : public Message { static const unsigned FLAG_NOTJOURNALED = 1<<2; static const unsigned FLAG_EROFS = 1<<3; static const unsigned FLAG_ABORT = 1<<4; + static const unsigned FLAG_INTERRUPTED = 1<<5; // for locking __u16 lock_type; // lock object type @@ -117,6 +118,7 @@ class MMDSSlaveRequest : public Message { bufferlist inode_export; version_t inode_export_v; bufferlist srci_replica; + mds_rank_t srcdn_auth; utime_t op_stamp; bufferlist stray; // stray dir + dentry @@ -142,6 +144,8 @@ public: bool is_error_rofs() { return (flags & FLAG_EROFS); } bool is_abort() { return (flags & FLAG_ABORT); } void mark_abort() { flags |= FLAG_ABORT; } + bool is_interrupted() { return (flags & FLAG_INTERRUPTED); } + void mark_interrupted() { flags |= FLAG_INTERRUPTED; } void set_lock_type(int t) { lock_type = t; } @@ -151,7 +155,7 @@ public: MMDSSlaveRequest(metareqid_t ri, __u32 att, int o) : Message(MSG_MDS_SLAVE_REQUEST), reqid(ri), attempt(att), op(o), flags(0), lock_type(0), - inode_export_v(0) { } + inode_export_v(0), srcdn_auth(MDS_RANK_NONE) { } private: ~MMDSSlaveRequest() override {} @@ -170,6 +174,7 @@ public: ::encode(op_stamp, payload); ::encode(inode_export, payload); ::encode(inode_export_v, payload); + ::encode(srcdn_auth, payload); ::encode(srci_replica, payload); ::encode(stray, payload); } @@ -188,6 +193,7 @@ public: ::decode(op_stamp, p); ::decode(inode_export, p); ::decode(inode_export_v, p); + ::decode(srcdn_auth, p); ::decode(srci_replica, p); ::decode(stray, p); }