]> git.apps.os.sepia.ceph.com Git - ceph.git/commitdiff
mds: set resolve/rejoin gather MDS set in advance
author: Yan, Zheng <zheng.z.yan@intel.com>
Thu, 14 Mar 2013 16:08:39 +0000 (00:08 +0800)
committer: Greg Farnum <greg@inktank.com>
Mon, 1 Apr 2013 16:17:19 +0000 (09:17 -0700)
An active MDS may receive a resolve/rejoin message before it receives
the mdsmap message announcing that the MDS cluster is in the
resolving/rejoining state. So instead of setting the gather MDS sets when
the mdsmap is received, set them in advance, when an MDS failure is detected.

Signed-off-by: Yan, Zheng <zheng.z.yan@intel.com>
Reviewed-by: Greg Farnum <greg@inktank.com>
src/mds/MDCache.cc
src/mds/MDCache.h

index e360465c2cbb4c2fa16ff6bf1705c4a2c7ccbb6c..d13b6d8c977a8b40fa4cef2413400083082fb520 100644 (file)
@@ -2436,18 +2436,17 @@ void MDCache::resolve_start()
     if (rootdir)
       adjust_subtree_auth(rootdir, CDIR_AUTH_UNKNOWN);
   }
+  resolve_gather = recovery_set;
+  resolve_gather.erase(mds->get_nodeid());
+  rejoin_gather = resolve_gather;
 }
 
 void MDCache::send_resolves()
 {
-  // reset resolve state
-  got_resolve.clear();
-  other_ambiguous_imports.clear();
-
   send_slave_resolves();
   if (!resolve_ack_gather.empty()) {
     dout(10) << "send_resolves still waiting for resolve ack from ("
-             << need_resolve_ack << ")" << dendl;
+            << resolve_ack_gather << ")" << dendl;
     return;
   }
   if (!need_resolve_rollback.empty()) {
@@ -2499,7 +2498,7 @@ void MDCache::send_slave_resolves()
        ++p) {
     dout(10) << "sending slave resolve to mds." << p->first << dendl;
     mds->send_message_mds(p->second, p->first);
-    need_resolve_ack.insert(p->first);
+    resolve_ack_gather.insert(p->first);
   }
 }
 
@@ -2602,16 +2601,15 @@ void MDCache::handle_mds_failure(int who)
   recovery_set.erase(mds->get_nodeid());
   dout(1) << "handle_mds_failure mds." << who << " : recovery peers are " << recovery_set << dendl;
 
-  // adjust my recovery lists
-  wants_resolve.erase(who);   // MDS will ask again
-  got_resolve.erase(who);     // i'll get another.
+  resolve_gather.insert(who);
   discard_delayed_resolve(who);
 
+  rejoin_gather.insert(who);
   rejoin_sent.erase(who);        // i need to send another
   rejoin_ack_gather.erase(who);  // i'll need/get another.
 
-  dout(10) << " wants_resolve " << wants_resolve << dendl;
-  dout(10) << " got_resolve " << got_resolve << dendl;
+  dout(10) << " resolve_gather " << resolve_gather << dendl;
+  dout(10) << " resolve_ack_gather " << resolve_ack_gather << dendl;
   dout(10) << " rejoin_sent " << rejoin_sent << dendl;
   dout(10) << " rejoin_gather " << rejoin_gather << dendl;
   dout(10) << " rejoin_ack_gather " << rejoin_ack_gather << dendl;
@@ -2792,7 +2790,7 @@ void MDCache::handle_resolve(MMDSResolve *m)
     return;
   }
 
-  if (!need_resolve_ack.empty() || !need_resolve_rollback.empty()) {
+  if (!resolve_ack_gather.empty() || !need_resolve_rollback.empty()) {
     dout(10) << "delay processing subtree resolve" << dendl;
     discard_delayed_resolve(from);
     delayed_resolve[from] = m;
@@ -2879,7 +2877,7 @@ void MDCache::handle_resolve(MMDSResolve *m)
   }
   
   // did i get them all?
-  got_resolve.insert(from);
+  resolve_gather.erase(from);
   
   maybe_resolve_finish();
 
@@ -2905,12 +2903,12 @@ void MDCache::discard_delayed_resolve(int who)
 
 void MDCache::maybe_resolve_finish()
 {
-  assert(need_resolve_ack.empty());
+  assert(resolve_ack_gather.empty());
   assert(need_resolve_rollback.empty());
 
-  if (got_resolve != recovery_set) {
-    dout(10) << "maybe_resolve_finish still waiting for more resolves, got (" 
-            << got_resolve << "), need (" << recovery_set << ")" << dendl;
+  if (!resolve_gather.empty()) {
+    dout(10) << "maybe_resolve_finish still waiting for resolves ("
+            << resolve_gather << ")" << dendl;
     return;
   } else {
     dout(10) << "maybe_resolve_finish got all resolves+resolve_acks, done." << dendl;
@@ -2930,7 +2928,7 @@ void MDCache::handle_resolve_ack(MMDSResolveAck *ack)
   dout(10) << "handle_resolve_ack " << *ack << " from " << ack->get_source() << dendl;
   int from = ack->get_source().num();
 
-  if (!need_resolve_ack.count(from)) {
+  if (!resolve_ack_gather.count(from)) {
     ack->put();
     return;
   }
@@ -3005,8 +3003,8 @@ void MDCache::handle_resolve_ack(MMDSResolveAck *ack)
       assert(p->second->slave_to_mds != from);
   }
 
-  need_resolve_ack.erase(from);
-  if (need_resolve_ack.empty() && need_resolve_rollback.empty()) {
+  resolve_ack_gather.erase(from);
+  if (resolve_ack_gather.empty() && need_resolve_rollback.empty()) {
     send_subtree_resolves();
     process_delayed_resolve();
   }
@@ -3073,7 +3071,7 @@ void MDCache::finish_rollback(metareqid_t reqid) {
   if (mds->is_resolve())
     finish_uncommitted_slave_update(reqid, need_resolve_rollback[reqid]);
   need_resolve_rollback.erase(reqid);
-  if (need_resolve_ack.empty() && need_resolve_rollback.empty()) {
+  if (resolve_ack_gather.empty() && need_resolve_rollback.empty()) {
     send_subtree_resolves();
     process_delayed_resolve();
   }
@@ -3421,7 +3419,6 @@ void MDCache::rejoin_send_rejoins()
     if (*p == mds->get_nodeid())  continue;  // nothing to myself!
     if (rejoin_sent.count(*p)) continue;     // already sent a rejoin to this node!
     if (mds->is_rejoin()) {
-      rejoin_gather.insert(*p);
       rejoins[*p] = new MMDSCacheRejoin(MMDSCacheRejoin::OP_WEAK);
       rejoins[*p]->copy_cap_exports(cap_export_bl);
     } else if (mds->mdsmap->is_rejoin(*p))
index f646e91866de7dae3cbc550b51ae7bbb60b0091c..41cd6e7d8d8c00e2a031da8ab71675409f6c171b 100644 (file)
@@ -329,9 +329,8 @@ protected:
   friend class ECommitted;
 
   bool resolves_pending;
-  set<int> wants_resolve;   // nodes i need to send my resolve to
-  set<int> got_resolve;     // nodes i got resolves from
-  set<int> need_resolve_ack;   // nodes i need a resolve_ack from
+  set<int> resolve_gather;     // nodes i need resolves from
+  set<int> resolve_ack_gather; // nodes i need a resolve_ack from
   map<metareqid_t, int> need_resolve_rollback;  // rollbacks i'm writing to the journal
   map<int, MMDSResolve*> delayed_resolve;