mds: handle state change race 24797/head
author     Yan, Zheng <zyan@redhat.com>
           Mon, 29 Oct 2018 03:03:00 +0000 (11:03 +0800)
committer  Yan, Zheng <zyan@redhat.com>
           Tue, 11 Dec 2018 12:21:24 +0000 (20:21 +0800)
In a multi-mds cluster, a recovering mds may receive the mdsmap that
changes its state later than the other mds do. Furthermore, the
recovering mds may receive messages triggered by its state change from
other mds before it receives the corresponding mdsmap. Instead of
asserting, defer such messages and retry them once the mds reaches the
expected state.

Fixes: http://tracker.ceph.com/issues/37594
Signed-off-by: "Yan, Zheng" <zyan@redhat.com>
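
The fix follows the MDS's usual defer-and-retry pattern: a handler that
sees a message "from the future" parks it on a wait list, and the state
transition the message anticipated re-dispatches it. Below is a minimal
self-contained sketch of that pattern; Rank, handle_peer_message, and
clientreplay_start are simplified stand-ins, not the actual MDSRank
interface:

#include <cstdlib>
#include <functional>
#include <iostream>
#include <string>
#include <utility>
#include <vector>

// Stand-ins for the relevant MDSMap states; the real enum is larger,
// but only the ordering (REJOIN < CLIENTREPLAY < ACTIVE) matters here.
enum State { STATE_REJOIN = 1, STATE_CLIENTREPLAY = 2, STATE_ACTIVE = 3 };

struct Rank {
  State state = STATE_REJOIN;       // what our last-seen mdsmap says
  State want_state = STATE_REJOIN;  // what the monitors have granted us
  std::vector<std::function<void()>> waiting_for_clientreplay;

  // A peer that saw the new mdsmap first may send us a message our own
  // map does not yet justify.
  void handle_peer_message(std::string msg) {
    if (state < STATE_CLIENTREPLAY) {
      if (want_state >= STATE_CLIENTREPLAY) {
        // The state change is already decided; park the message and
        // retry it once our mdsmap catches up.
        waiting_for_clientreplay.push_back(
            [this, m = std::move(msg)] { handle_peer_message(m); });
        return;
      }
      std::abort();  // truly unexpected: peer is ahead of the monitors
    }
    std::cout << "processing " << msg << "\n";
  }

  // Called when the mdsmap moving us to clientreplay finally arrives.
  void clientreplay_start() {
    state = STATE_CLIENTREPLAY;
    auto ls = std::move(waiting_for_clientreplay);  // drain, then run
    for (auto& c : ls)
      c();
  }
};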
src/mds/Locker.cc
src/mds/MDCache.cc
src/mds/MDSRank.cc
src/mds/MDSRank.h

diff --git a/src/mds/Locker.cc b/src/mds/Locker.cc
index 2bc6c4175df825105b5a7439b8c239fd3b2c188a..83e2134f577b781a94eda0604679ddc8de3ee9c4 100644
@@ -2224,7 +2224,13 @@ void Locker::request_inode_file_caps(CInode *in)
 void Locker::handle_inode_file_caps(const MInodeFileCaps::const_ref &m)
 {
   // nobody should be talking to us during recovery.
-  ceph_assert(mds->is_clientreplay() || mds->is_active() || mds->is_stopping());
+  if (mds->get_state() < MDSMap::STATE_CLIENTREPLAY) {
+    if (mds->get_want_state() >= MDSMap::STATE_CLIENTREPLAY) {
+      mds->wait_for_replay(new C_MDS_RetryMessage(mds, m));
+      return;
+    }
+    ceph_abort_msg("got unexpected message during recovery");
+  }
 
   // ok
   CInode *in = mdcache->get_inode(m->get_ino());
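
C_MDS_RetryMessage is the context used above to park the message; all it
has to do is hold the message and feed it back into the dispatcher when
completed. A rough sketch of that shape, with Message and Dispatcher as
illustrative stand-ins rather than Ceph's actual classes:

#include <memory>
#include <utility>

struct Message { /* payload */ };

struct Dispatcher {
  virtual void dispatch(const std::shared_ptr<Message>& m) = 0;
  virtual ~Dispatcher() = default;
};

// Completing the context re-runs the original handler, which takes the
// normal path now that the mds has reached the expected state.
struct RetryMessageContext {
  Dispatcher* mds;
  std::shared_ptr<Message> m;
  RetryMessageContext(Dispatcher* mds, std::shared_ptr<Message> m)
      : mds(mds), m(std::move(m)) {}
  void finish(int /*r*/) { mds->dispatch(m); }
};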
diff --git a/src/mds/MDCache.cc b/src/mds/MDCache.cc
index 71a6c828e10049a4f660989b4ec734ef150aa7c7..bbb168b0bea86b8cb49a41c079cfcef3a7ba1546 100644
@@ -4643,7 +4643,13 @@ void MDCache::handle_cache_rejoin_strong(const MMDSCacheRejoin::const_ref &stron
   mds_rank_t from = mds_rank_t(strong->get_source().num());
 
   // only a recovering node will get a strong rejoin.
-  ceph_assert(mds->is_rejoin());
+  if (!mds->is_rejoin()) {
+    if (mds->get_want_state() == MDSMap::STATE_REJOIN) {
+      mds->wait_for_rejoin(new C_MDS_RetryMessage(mds, strong));
+      return;
+    }
+    ceph_abort_msg("got unexpected rejoin message during recovery");
+  }
 
   // assimilate any potentially dirty scatterlock state
   for (const auto &p : strong->inode_scatterlocks) {
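
Note that the two guards differ deliberately: handle_inode_file_caps
accepts any want_state >= STATE_CLIENTREPLAY, since file-caps messages
are legal from clientreplay onward, while handle_cache_rejoin_strong
requires want_state == STATE_REJOIN, because (as the comment above says)
only a recovering, rejoining mds should ever be sent a strong rejoin.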
diff --git a/src/mds/MDSRank.cc b/src/mds/MDSRank.cc
index 728b3301cec817a6373a55dfe23b57a154bae563..417a43a87dd15f9d0b8444b571214938e88ec019 100644
@@ -1832,6 +1832,7 @@ void MDSRank::rejoin_start()
 {
   dout(1) << "rejoin_start" << dendl;
   mdcache->rejoin_start(new C_MDS_VoidFn(this, &MDSRank::rejoin_done));
+  finish_contexts(g_ceph_context, waiting_for_rejoin);
 }
 void MDSRank::rejoin_done()
 {
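
The finish_contexts() call added here is what releases messages parked by
wait_for_rejoin(): entering rejoin_start() drains the waiting_for_rejoin
list, so a deferred strong rejoin is re-dispatched exactly when the rank
starts rejoining. A generic sketch of the drain-and-complete idiom
(Ceph's real finish_contexts() additionally takes a CephContext* for
debug logging):

#include <utility>
#include <vector>

struct Context {
  virtual ~Context() = default;
  virtual void finish(int r) = 0;
  void complete(int r) { finish(r); delete this; }  // one-shot
};

// Swap the list out before completing anything, so a context that
// re-queues itself (e.g. a message that must wait again) lands on a
// fresh list instead of mutating the one being iterated.
inline void finish_contexts(std::vector<Context*>& waiting, int result = 0) {
  std::vector<Context*> ls;
  ls.swap(waiting);
  for (Context* c : ls)
    c->complete(result);
}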
diff --git a/src/mds/MDSRank.h b/src/mds/MDSRank.h
index 10a533a1e595788a91b8e268f31b7a864d0d4569..57e8749562fafca867cd77764bafb4dd9d1b9f3d 100644
@@ -275,7 +275,8 @@ class MDSRank {
 
     ceph_tid_t last_tid;    // for mds-initiated requests (e.g. stray rename)
 
-    MDSInternalContextBase::vec waiting_for_active, waiting_for_replay, waiting_for_reconnect, waiting_for_resolve;
+    MDSInternalContextBase::vec waiting_for_active, waiting_for_replay, waiting_for_rejoin,
+                               waiting_for_reconnect, waiting_for_resolve;
     MDSInternalContextBase::vec waiting_for_any_client_connection;
     MDSInternalContextBase::que replay_queue;
     map<mds_rank_t, MDSInternalContextBase::vec > waiting_for_active_peer;
@@ -408,6 +409,9 @@ class MDSRank {
     void wait_for_replay(MDSInternalContextBase *c) { 
       waiting_for_replay.push_back(c); 
     }
+    void wait_for_rejoin(MDSInternalContextBase *c) {
+      waiting_for_rejoin.push_back(c);
+    }
     void wait_for_reconnect(MDSInternalContextBase *c) {
       waiting_for_reconnect.push_back(c);
     }