]> git-server-git.apps.pok.os.sepia.ceph.com Git - ceph.git/commitdiff
mds: handle state change race 26005/head
authorYan, Zheng <zyan@redhat.com>
Mon, 29 Oct 2018 03:03:00 +0000 (11:03 +0800)
committerYan, Zheng <zyan@redhat.com>
Thu, 17 Jan 2019 13:39:55 +0000 (21:39 +0800)
In a multi-mds cluster, a recovering mds may receive an mdsmap that changes
its state after the other mds do. Furthermore, the recovering mds may receive
messages triggered by its state change from other mds before it receives
the corresponding mdsmap.

Fixes: http://tracker.ceph.com/issues/37594
Signed-off-by: "Yan, Zheng" <zyan@redhat.com>
(cherry picked from commit d3a444473abc98e5ce8121af24538a141a292777)

 Conflicts:
src/mds/Locker.cc
src/mds/MDCache.cc
src/mds/MDSRank.h

src/mds/Locker.cc
src/mds/MDCache.cc
src/mds/MDSRank.cc
src/mds/MDSRank.h

index 42b47087f24b3e923f5b195b369b0b18bf6b5363..f67047c8ab1a4fcc0ffbd39e11318da6e76e37bd 100644 (file)
@@ -2257,7 +2257,13 @@ void Locker::request_inode_file_caps(CInode *in)
 void Locker::handle_inode_file_caps(MInodeFileCaps *m)
 {
   // nobody should be talking to us during recovery.
-  assert(mds->is_clientreplay() || mds->is_active() || mds->is_stopping());
+  if (mds->get_state() < MDSMap::STATE_CLIENTREPLAY) {
+    if (mds->get_want_state() >= MDSMap::STATE_CLIENTREPLAY) {
+      mds->wait_for_replay(new C_MDS_RetryMessage(mds, m));
+      return;
+    }
+    assert(!"got unexpected message during recovery");
+  }
 
   // ok
   CInode *in = mdcache->get_inode(m->get_ino());
index d7b40a3126f6522e7bfbf3a924ae1b114ddc69c4..e07df26063fac5df1cf1cc054ac9c6f3bf0a97ee 100644 (file)
@@ -4643,7 +4643,13 @@ void MDCache::handle_cache_rejoin_strong(MMDSCacheRejoin *strong)
   mds_rank_t from = mds_rank_t(strong->get_source().num());
 
   // only a recovering node will get a strong rejoin.
-  assert(mds->is_rejoin());
+  if (!mds->is_rejoin()) {
+    if (mds->get_want_state() == MDSMap::STATE_REJOIN) {
+      mds->wait_for_rejoin(new C_MDS_RetryMessage(mds, strong));
+      return;
+    }
+    assert(!"got unexpected rejoin message during recovery");
+  }
 
   // assimilate any potentially dirty scatterlock state
   for (map<inodeno_t,MMDSCacheRejoin::lock_bls>::iterator p = strong->inode_scatterlocks.begin();
index 3e3ac1d0968c3794fb0777359f20d76a0a0a6cf8..e76d1c79a6ecbcb0ed9eee5bf92e100057e5246b 100644 (file)
@@ -1817,6 +1817,7 @@ void MDSRank::rejoin_start()
 {
   dout(1) << "rejoin_start" << dendl;
   mdcache->rejoin_start(new C_MDS_VoidFn(this, &MDSRank::rejoin_done));
+  finish_contexts(g_ceph_context, waiting_for_rejoin);
 }
 void MDSRank::rejoin_done()
 {
index 46f72a4b964bb1644f0daa030954661ddbc390a9..6be660d10655e8599a36394773a41019b13714d8 100644 (file)
@@ -269,7 +269,8 @@ class MDSRank {
 
     ceph_tid_t last_tid;    // for mds-initiated requests (e.g. stray rename)
 
-    list<MDSInternalContextBase*> waiting_for_active, waiting_for_replay, waiting_for_reconnect, waiting_for_resolve;
+    list<MDSInternalContextBase*> waiting_for_active, waiting_for_replay, waiting_for_rejoin,
+                                 waiting_for_reconnect, waiting_for_resolve;
     list<MDSInternalContextBase*> waiting_for_any_client_connection;
     list<MDSInternalContextBase*> replay_queue;
     map<mds_rank_t, list<MDSInternalContextBase*> > waiting_for_active_peer;
@@ -409,6 +410,9 @@ class MDSRank {
     void wait_for_replay(MDSInternalContextBase *c) { 
       waiting_for_replay.push_back(c); 
     }
+    void wait_for_rejoin(MDSInternalContextBase *c) {
+      waiting_for_rejoin.push_back(c);
+    }
     void wait_for_reconnect(MDSInternalContextBase *c) {
       waiting_for_reconnect.push_back(c);
     }