From: Yan, Zheng <zyan@redhat.com>
Date: Mon, 29 Oct 2018 03:03:00 +0000 (+0800)
Subject: mds: handle state change race
X-Git-Tag: v14.1.0~553^2
X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=d3a444473abc98e5ce8121af24538a141a292777;p=ceph.git

mds: handle state change race

In a multi-mds cluster, a recovering mds may receive the mdsmap that changes
its state later than the other mds do. Furthermore, the recovering mds may
receive messages triggered by its state change from other mds before it
receives the corresponding mdsmap.

Fixes: http://tracker.ceph.com/issues/37594
Signed-off-by: "Yan, Zheng" <zyan@redhat.com>
---

diff --git a/src/mds/Locker.cc b/src/mds/Locker.cc
index 2bc6c4175df8..83e2134f577b 100644
--- a/src/mds/Locker.cc
+++ b/src/mds/Locker.cc
@@ -2224,7 +2224,13 @@ void Locker::request_inode_file_caps(CInode *in)
 void Locker::handle_inode_file_caps(const MInodeFileCaps::const_ref &m)
 {
   // nobody should be talking to us during recovery.
-  ceph_assert(mds->is_clientreplay() || mds->is_active() || mds->is_stopping());
+  if (mds->get_state() < MDSMap::STATE_CLIENTREPLAY) {
+    if (mds->get_want_state() >= MDSMap::STATE_CLIENTREPLAY) {
+      mds->wait_for_replay(new C_MDS_RetryMessage(mds, m));
+      return;
+    }
+    ceph_abort_msg("got unexpected message during recovery");
+  }
 
   // ok
   CInode *in = mdcache->get_inode(m->get_ino());
diff --git a/src/mds/MDCache.cc b/src/mds/MDCache.cc
index 71a6c828e100..bbb168b0bea8 100644
--- a/src/mds/MDCache.cc
+++ b/src/mds/MDCache.cc
@@ -4643,7 +4643,13 @@ void MDCache::handle_cache_rejoin_strong(const MMDSCacheRejoin::const_ref &stron
   mds_rank_t from = mds_rank_t(strong->get_source().num());
 
   // only a recovering node will get a strong rejoin.
-  ceph_assert(mds->is_rejoin());
+  if (!mds->is_rejoin()) {
+    if (mds->get_want_state() == MDSMap::STATE_REJOIN) {
+      mds->wait_for_rejoin(new C_MDS_RetryMessage(mds, strong));
+      return;
+    }
+    ceph_abort_msg("got unexpected rejoin message during recovery");
+  }
 
   // assimilate any potentially dirty scatterlock state
   for (const auto &p : strong->inode_scatterlocks) {
diff --git a/src/mds/MDSRank.cc b/src/mds/MDSRank.cc
index 728b3301cec8..417a43a87dd1 100644
--- a/src/mds/MDSRank.cc
+++ b/src/mds/MDSRank.cc
@@ -1832,6 +1832,7 @@ void MDSRank::rejoin_start()
 {
   dout(1) << "rejoin_start" << dendl;
   mdcache->rejoin_start(new C_MDS_VoidFn(this, &MDSRank::rejoin_done));
+  finish_contexts(g_ceph_context, waiting_for_rejoin);
 }
 void MDSRank::rejoin_done()
 {
diff --git a/src/mds/MDSRank.h b/src/mds/MDSRank.h
index 10a533a1e595..57e8749562fa 100644
--- a/src/mds/MDSRank.h
+++ b/src/mds/MDSRank.h
@@ -275,7 +275,8 @@ class MDSRank {
 
     ceph_tid_t last_tid;    // for mds-initiated requests (e.g. stray rename)
 
-    MDSInternalContextBase::vec waiting_for_active, waiting_for_replay, waiting_for_reconnect, waiting_for_resolve;
+    MDSInternalContextBase::vec waiting_for_active, waiting_for_replay, waiting_for_rejoin,
+				waiting_for_reconnect, waiting_for_resolve;
     MDSInternalContextBase::vec waiting_for_any_client_connection;
     MDSInternalContextBase::que replay_queue;
     map<mds_rank_t, MDSInternalContextBase::vec > waiting_for_active_peer;
@@ -408,6 +409,9 @@ class MDSRank {
     void wait_for_replay(MDSInternalContextBase *c) { 
       waiting_for_replay.push_back(c); 
     }
+    void wait_for_rejoin(MDSInternalContextBase *c) {
+      waiting_for_rejoin.push_back(c);
+    }
     void wait_for_reconnect(MDSInternalContextBase *c) {
       waiting_for_reconnect.push_back(c);
     }