From: Sage Weil Date: Tue, 30 Nov 2010 23:43:53 +0000 (-0800) Subject: mds: fix resolve for surviving observers X-Git-Tag: v0.24~90 X-Git-Url: http://git.apps.os.sepia.ceph.com/?a=commitdiff_plain;h=08bd4eadd29d84081b4e521aec735315e33077a0;p=ceph.git mds: fix resolve for surviving observers Make all survivors participate in resolve stage, so that survivors can properly determine the outcome of migrations to the failed node that did not complete. The sequence (before): - A starts to export /foo to B - C has ambiguous auth (A,B) in its subtree map - B journals import_start - B fails ... - B restarts - B sends resolves to everyone - does not claim /foo - A sends resolve _only_ to B - does claim /foo - B knows its import did not complete - C doesn't know anything. Also, the maybe_resolve_finish stuff was totally broken because the recovery_set wasn't initialized. See new (commented out) assert in Migrator.cc to reproduce the above. Signed-off-by: Sage Weil --- diff --git a/src/mds/MDCache.cc b/src/mds/MDCache.cc index 18cb74ee3f5c7..2519953b476f4 100644 --- a/src/mds/MDCache.cc +++ b/src/mds/MDCache.cc @@ -2155,23 +2155,26 @@ void MDCache::resolve_start() if (rootdir) adjust_subtree_auth(rootdir, CDIR_AUTH_UNKNOWN); } +} + +void MDCache::send_resolves() +{ + // reset resolve state + got_resolve.clear(); + other_ambiguous_imports.clear(); for (set::iterator p = recovery_set.begin(); p != recovery_set.end(); ++p) { - if (*p == mds->whoami) + int who = *p; + if (who == mds->whoami) continue; - send_resolve(*p); // now. 
+ if (migrator->is_importing() || + migrator->is_exporting()) + send_resolve_later(who); + else + send_resolve_now(who); } } -void MDCache::send_resolve(int who) -{ - if (migrator->is_importing() || - migrator->is_exporting()) - send_resolve_later(who); - else - send_resolve_now(who); -} - void MDCache::send_resolve_later(int who) { dout(10) << "send_resolve_later to mds" << who << dendl; @@ -2512,6 +2515,8 @@ void MDCache::handle_resolve(MMDSResolve *m) } // update my dir_auth values + // need to do this on recoverying nodes _and_ bystanders (to resolve ambiguous + // migrations between other nodes) for (map >::iterator pi = m->subtrees.begin(); pi != m->subtrees.end(); ++pi) { @@ -2523,17 +2528,16 @@ void MDCache::handle_resolve(MMDSResolve *m) << diri->dirfragtree << " on " << pi->first << dendl; } - + CDir *dir = diri->get_dirfrag(pi->first.frag); if (!dir) continue; - + adjust_bounded_subtree_auth(dir, pi->second, from); try_subtree_merge(dir); } show_subtrees(); - // note ambiguous imports too for (map >::iterator pi = m->ambiguous_imports.begin(); pi != m->ambiguous_imports.end(); diff --git a/src/mds/MDCache.h b/src/mds/MDCache.h index e0d8bc275d104..e4931a7fae945 100644 --- a/src/mds/MDCache.h +++ b/src/mds/MDCache.h @@ -693,7 +693,7 @@ public: void cancel_ambiguous_import(dirfrag_t dirino); void finish_ambiguous_import(dirfrag_t dirino); void resolve_start(); - void send_resolve(int who); + void send_resolves(); void send_resolve_now(int who); void send_resolve_later(int who); void maybe_send_pending_resolves(); diff --git a/src/mds/MDS.cc b/src/mds/MDS.cc index ce3760c97f35d..30dccd81a6b2b 100644 --- a/src/mds/MDS.cc +++ b/src/mds/MDS.cc @@ -923,11 +923,9 @@ void MDS::handle_mds_map(MMDSMap *m) oldmap->get_mds_set(oldresolve, MDSMap::STATE_RESOLVE); mdsmap->get_mds_set(resolve, MDSMap::STATE_RESOLVE); if (oldresolve != resolve) { - dout(10) << "resolve set is " << resolve << ", was " << oldresolve << dendl; - for (set::iterator p = resolve.begin(); p != 
resolve.end(); ++p) - if (*p != whoami && - oldresolve.count(*p) == 0) - mdcache->send_resolve(*p); // now or later. + dout(10) << " resolve set is " << resolve << ", was " << oldresolve << dendl; + calc_recovery_set(); + mdcache->send_resolves(); } } @@ -1166,19 +1164,27 @@ void MDS::starting_done() } -void MDS::replay_start() +void MDS::calc_recovery_set() { - dout(1) << "replay_start" << dendl; - // initialize gather sets set rs; mdsmap->get_recovery_mds_set(rs); rs.erase(whoami); - dout(1) << "now replay. my recovery peers are " << rs - << ". need osdmap epoch " << mdsmap->get_last_failure_osd_epoch() + mdcache->set_recovery_set(rs); + + dout(1) << " recovery set is " << rs << dendl; +} + + +void MDS::replay_start() +{ + dout(1) << "replay_start" << dendl; + + calc_recovery_set(); + + dout(1) << " need osdmap epoch " << mdsmap->get_last_failure_osd_epoch() <<", have " << osdmap->get_epoch() << dendl; - mdcache->set_recovery_set(rs); // start? if (osdmap->get_epoch() >= mdsmap->get_last_failure_osd_epoch()) { diff --git a/src/mds/MDS.h b/src/mds/MDS.h index 787211365b982..868bc78a008af 100644 --- a/src/mds/MDS.h +++ b/src/mds/MDS.h @@ -347,6 +347,8 @@ class MDS : public Dispatcher { void boot_create(); // i am new mds. void boot_start(int step=0, int r=0); // starting|replay + void calc_recovery_set(); + void replay_start(); void creating_done(); void starting_done(); diff --git a/src/mds/Migrator.cc b/src/mds/Migrator.cc index f2a0c072027fc..06c1224744d47 100644 --- a/src/mds/Migrator.cc +++ b/src/mds/Migrator.cc @@ -1993,6 +1993,10 @@ void Migrator::import_logged_start(CDir *dir, int from, // send notify's etc. dout(7) << "sending ack for " << *dir << " to old auth mds" << from << dendl; + + // test surviving observer of a failed migration that did not complete + //assert(dir->replica_map.size() < 2 || mds->whoami != 0); + mds->send_message_mds(new MExportDirAck(dir->dirfrag()), from); assert (g_conf.mds_kill_import_at != 8);