git.apps.os.sepia.ceph.com Git - ceph.git/commitdiff
mds: fix resolve for surviving observers
author    Sage Weil <sage@newdream.net>
          Tue, 30 Nov 2010 23:43:53 +0000 (15:43 -0800)
committer Sage Weil <sage@newdream.net>
          Tue, 30 Nov 2010 23:43:53 +0000 (15:43 -0800)
Make all survivors participate in the resolve stage, so that survivors can
properly determine the outcome of migrations to the failed node that did
not complete.

The sequence (before):
 - A starts to export /foo to B
 - C has ambiguous auth (A,B) in its subtree map
 - B journals import_start
 - B fails
...
 - B restarts
 - B sends resolves to everyone
   - does not claim /foo
 - A sends resolve _only_ to B
   - does claim /foo
 - B knows its import did not complete
 - C doesn't know anything.  Also, the maybe_resolve_finish stuff was
   totally broken because the recovery_set wasn't initialized

See the new (commented-out) assert in Migrator.cc to reproduce the above.

Signed-off-by: Sage Weil <sage@newdream.net>
src/mds/MDCache.cc
src/mds/MDCache.h
src/mds/MDS.cc
src/mds/MDS.h
src/mds/Migrator.cc
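
For illustration, a minimal sketch of how the resolve fan-out changes. This
is not Ceph code: the ranks, set names, and helper functions below are
hypothetical stand-ins for the MDCache/MDSMap plumbing in the diff.

#include <iostream>
#include <set>

using namespace std;

// Before the fix: a survivor only sent a resolve to ranks that had just
// entered the resolve state (here, only B after its restart).
set<int> targets_before(int whoami, const set<int>& newly_resolving) {
  set<int> out;
  for (set<int>::const_iterator p = newly_resolving.begin();
       p != newly_resolving.end(); ++p)
    if (*p != whoami)
      out.insert(*p);
  return out;
}

// After the fix: the survivor recomputes its recovery set and resolves with
// every peer in it, so a bystander like C also hears A's claim on /foo.
set<int> targets_after(int whoami, const set<int>& recovery_set) {
  set<int> out;
  for (set<int>::const_iterator p = recovery_set.begin();
       p != recovery_set.end(); ++p)
    if (*p != whoami)
      out.insert(*p);
  return out;
}

int main() {
  // Ranks: 0 = A (exporter), 1 = B (failed importer), 2 = C (bystander).
  set<int> newly_resolving;
  newly_resolving.insert(1);                 // only B re-entered resolve
  set<int> recovery_set;
  recovery_set.insert(0);
  recovery_set.insert(1);
  recovery_set.insert(2);                    // everyone participates now

  set<int> before = targets_before(0, newly_resolving);
  set<int> after  = targets_after(0, recovery_set);
  for (set<int>::iterator p = before.begin(); p != before.end(); ++p)
    cout << "before: mds0 -> mds" << *p << endl;   // mds1 only
  for (set<int>::iterator p = after.begin(); p != after.end(); ++p)
    cout << "after:  mds0 -> mds" << *p << endl;   // mds1 and mds2
  return 0;
}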

diff --git a/src/mds/MDCache.cc b/src/mds/MDCache.cc
index 18cb74ee3f5c7a4eeb388159c4a3409a3f0d2ca2..2519953b476f4ac0f31639e3dc36119c90568eeb 100644
@@ -2155,23 +2155,26 @@ void MDCache::resolve_start()
     if (rootdir)
       adjust_subtree_auth(rootdir, CDIR_AUTH_UNKNOWN);
   }
+}
+
+void MDCache::send_resolves()
+{
+  // reset resolve state
+  got_resolve.clear();
+  other_ambiguous_imports.clear();
 
   for (set<int>::iterator p = recovery_set.begin(); p != recovery_set.end(); ++p) {
-    if (*p == mds->whoami)
+    int who = *p;
+    if (who == mds->whoami)
       continue;
-    send_resolve(*p);  // now.
+    if (migrator->is_importing() ||
+       migrator->is_exporting())
+      send_resolve_later(who);
+    else
+      send_resolve_now(who);
   }
 }
 
-void MDCache::send_resolve(int who)
-{
-  if (migrator->is_importing() || 
-      migrator->is_exporting())
-    send_resolve_later(who);
-  else
-    send_resolve_now(who);
-}
-
 void MDCache::send_resolve_later(int who)
 {
   dout(10) << "send_resolve_later to mds" << who << dendl;
@@ -2512,6 +2515,8 @@ void MDCache::handle_resolve(MMDSResolve *m)
   }    
 
   // update my dir_auth values
+  //   need to do this on recovering nodes _and_ bystanders (to resolve ambiguous
+  //   migrations between other nodes)
   for (map<dirfrag_t, vector<dirfrag_t> >::iterator pi = m->subtrees.begin();
        pi != m->subtrees.end();
        ++pi) {
@@ -2523,17 +2528,16 @@ void MDCache::handle_resolve(MMDSResolve *m)
               << diri->dirfragtree 
               << " on " << pi->first << dendl;
     }
-
+    
     CDir *dir = diri->get_dirfrag(pi->first.frag);
     if (!dir) continue;
-
+    
     adjust_bounded_subtree_auth(dir, pi->second, from);
     try_subtree_merge(dir);
   }
 
   show_subtrees();
 
-
   // note ambiguous imports too
   for (map<dirfrag_t, vector<dirfrag_t> >::iterator pi = m->ambiguous_imports.begin();
        pi != m->ambiguous_imports.end();
diff --git a/src/mds/MDCache.h b/src/mds/MDCache.h
index e0d8bc275d104a14e7fbe18307f06275ba786d17..e4931a7fae945d29c74af7f1cda763af9803e4c1 100644
@@ -693,7 +693,7 @@ public:
   void cancel_ambiguous_import(dirfrag_t dirino);
   void finish_ambiguous_import(dirfrag_t dirino);
   void resolve_start();
-  void send_resolve(int who);
+  void send_resolves();
   void send_resolve_now(int who);
   void send_resolve_later(int who);
   void maybe_send_pending_resolves();
diff --git a/src/mds/MDS.cc b/src/mds/MDS.cc
index ce3760c97f35d01e004d21c3f6221bf682be556d..30dccd81a6b2ba25bfbc232168091bf556b87191 100644
@@ -923,11 +923,9 @@ void MDS::handle_mds_map(MMDSMap *m)
     oldmap->get_mds_set(oldresolve, MDSMap::STATE_RESOLVE);
     mdsmap->get_mds_set(resolve, MDSMap::STATE_RESOLVE);
     if (oldresolve != resolve) {
-      dout(10) << "resolve set is " << resolve << ", was " << oldresolve << dendl;
-      for (set<int>::iterator p = resolve.begin(); p != resolve.end(); ++p) 
-       if (*p != whoami &&
-           oldresolve.count(*p) == 0)
-         mdcache->send_resolve(*p);  // now or later.
+      dout(10) << " resolve set is " << resolve << ", was " << oldresolve << dendl;
+      calc_recovery_set();
+      mdcache->send_resolves();
     }
   }
   
@@ -1166,19 +1164,27 @@ void MDS::starting_done()
 }
 
 
-void MDS::replay_start()
+void MDS::calc_recovery_set()
 {
-  dout(1) << "replay_start" << dendl;
-  
   // initialize gather sets
   set<int> rs;
   mdsmap->get_recovery_mds_set(rs);
   rs.erase(whoami);
-  dout(1) << "now replay.  my recovery peers are " << rs
-         << ".  need osdmap epoch " << mdsmap->get_last_failure_osd_epoch()
+  mdcache->set_recovery_set(rs);
+
+  dout(1) << " recovery set is " << rs << dendl;
+}
+
+
+void MDS::replay_start()
+{
+  dout(1) << "replay_start" << dendl;
+  
+  calc_recovery_set();
+
+  dout(1) << " need osdmap epoch " << mdsmap->get_last_failure_osd_epoch()
          <<", have " << osdmap->get_epoch()
          << dendl;
-  mdcache->set_recovery_set(rs);
 
   // start?
   if (osdmap->get_epoch() >= mdsmap->get_last_failure_osd_epoch()) {
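
A reduced model of the calc_recovery_set() helper factored out above,
assuming mdsmap->get_recovery_mds_set() simply fills a set of ranks (the
MDSMap and MDCache plumbing is elided):

#include <set>
#include <cassert>

using namespace std;

// whoami is erased because the recovery set holds the *peers* we must
// exchange resolves with; the real code caches the result via
// mdcache->set_recovery_set(rs).
set<int> calc_recovery_set(int whoami, set<int> rs /* from the mds map */) {
  rs.erase(whoami);
  return rs;
}

int main() {
  set<int> all;
  all.insert(0); all.insert(1); all.insert(2);
  set<int> rs = calc_recovery_set(0, all);
  assert(rs.size() == 2 && rs.count(1) && rs.count(2));  // mds0's peers
  return 0;
}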
diff --git a/src/mds/MDS.h b/src/mds/MDS.h
index 787211365b982cfc6b621bdb5f3792ba9d86c507..868bc78a008afd08632a6b2d7bf776f4370936b7 100644
@@ -347,6 +347,8 @@ class MDS : public Dispatcher {
   void boot_create();             // i am new mds.
   void boot_start(int step=0, int r=0);    // starting|replay
 
+  void calc_recovery_set();
+
   void replay_start();
   void creating_done();
   void starting_done();
diff --git a/src/mds/Migrator.cc b/src/mds/Migrator.cc
index f2a0c072027fc9907238fb08e65f27542d0a726c..06c1224744d47ae1a116d731a806492949472017 100644
@@ -1993,6 +1993,10 @@ void Migrator::import_logged_start(CDir *dir, int from,
   
   // send notify's etc.
   dout(7) << "sending ack for " << *dir << " to old auth mds" << from << dendl;
+
+  // test surviving observer of a failed migration that did not complete
+  //assert(dir->replica_map.size() < 2 || mds->whoami != 0);
+
   mds->send_message_mds(new MExportDirAck(dir->dirfrag()), from);
   assert (g_conf.mds_kill_import_at != 8);
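
To reproduce the pre-fix failure, the commented-out check above can be
re-enabled; it aborts the importer after import_start is journaled but
before the ack is sent to the old auth, whenever the dir is replicated on
more than one other node, i.e. there is a surviving observer. Note the
rank-0 condition is an assumption of that particular test setup, not a
general invariant:

  // reproduction check, uncommented (assumes the importer under test is mds0)
  assert(dir->replica_map.size() < 2 || mds->whoami != 0);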