From: Yan, Zheng Date: Wed, 11 Jul 2018 03:01:43 +0000 (+0800) Subject: mds: handle discontinuous mdsmap X-Git-Tag: v14.0.1~812^2~1 X-Git-Url: http://git.apps.os.sepia.ceph.com/?a=commitdiff_plain;h=7de9da4a43914082eea0a1f9c8b98ce285e873ef;p=ceph-ci.git mds: handle discontinuous mdsmap There are two cases in which an mds can get a discontinuous mdsmap: - the mdsmap was sent by another mds - the connection to the monitor was reset The monitor does not preserve old versions of the mdsmap. There is no easy way to ensure an mds always gets a continuous mdsmap. Instead, making the mds handle a discontinuous mdsmap is not difficult. When failover happens in a multimds cluster, a survivor mds may miss the mdsmap that indicates the old mds failed and/or the mdsmap that indicates the new mds started to replay. But the survivor mds always gets the mdsmap that indicates the new mds entered resolve state. Fixes: http://tracker.ceph.com/issues/24856 Signed-off-by: "Yan, Zheng" --- diff --git a/src/mds/MDSMap.cc b/src/mds/MDSMap.cc index 9f985da6953..446b59cb616 100644 --- a/src/mds/MDSMap.cc +++ b/src/mds/MDSMap.cc @@ -898,12 +898,12 @@ bool MDSMap::state_transition_valid(DaemonState prev, DaemonState next) state_valid = false; } } else if (prev == MDSMap::STATE_REJOIN) { - if (next != MDSMap::STATE_ACTIVE - && next != MDSMap::STATE_CLIENTREPLAY - && next != MDSMap::STATE_STOPPED) { + if (next != MDSMap::STATE_ACTIVE && + next != MDSMap::STATE_CLIENTREPLAY && + next != MDSMap::STATE_STOPPED) { state_valid = false; } - } else if (prev >= MDSMap::STATE_RECONNECT && prev < MDSMap::STATE_ACTIVE) { + } else if (prev >= MDSMap::STATE_RESOLVE && prev < MDSMap::STATE_ACTIVE) { // Once I have entered replay, the only allowable transitions are to // the next next along in the sequence. 
if (next != prev + 1) { diff --git a/src/mds/MDSRank.cc b/src/mds/MDSRank.cc index 9d7b77dc213..ffb3b174ce8 100644 --- a/src/mds/MDSRank.cc +++ b/src/mds/MDSRank.cc @@ -1743,6 +1743,62 @@ void MDSRankDispatcher::handle_mds_map( if (g_conf()->mds_dump_cache_on_map) mdcache->dump_cache(); + // mdsmap and oldmap can be discontinuous. failover might happen in the missing mdsmap. + // the 'restart' set tracks ranks that have restarted since the old mdsmap + set restart; + // replaying mds does not communicate with other ranks + if (state >= MDSMap::STATE_RESOLVE) { + // did someone fail? + // new down? + set olddown, down; + oldmap->get_down_mds_set(&olddown); + mdsmap->get_down_mds_set(&down); + for (const auto& r : down) { + if (oldmap->have_inst(r) && olddown.count(r) == 0) { + messenger->mark_down_addrs(oldmap->get_addrs(r)); + handle_mds_failure(r); + } + } + + // did someone fail? + // did their addr/inst change? + set up; + mdsmap->get_up_mds_set(up); + for (const auto& r : up) { + auto& info = mdsmap->get_info(r); + if (oldmap->have_inst(r)) { + auto& oldinfo = oldmap->get_info(r); + if (info.inc != oldinfo.inc) { + messenger->mark_down_addrs(oldinfo.get_addrs()); + if (info.state == MDSMap::STATE_REPLAY || + info.state == MDSMap::STATE_RESOLVE) { + restart.insert(r); + handle_mds_failure(r); + } else { + assert(info.state == MDSMap::STATE_STARTING || + info.state == MDSMap::STATE_ACTIVE); + // -> stopped (missing) -> starting -> active + restart.insert(r); + mdcache->migrator->handle_mds_failure_or_stop(r); + if (mdsmap->get_tableserver() == whoami) + snapserver->handle_mds_failure_or_stop(r); + } + } + } else { + if (info.state == MDSMap::STATE_REPLAY || + info.state == MDSMap::STATE_RESOLVE) { + // -> starting/creating (missing) -> active (missing) -> replay -> resolve + restart.insert(r); + handle_mds_failure(r); + } else { + assert(info.state == MDSMap::STATE_CREATING || + info.state == MDSMap::STATE_STARTING || + info.state == MDSMap::STATE_ACTIVE); + } + 
} + } + } + // did it change? if (oldstate != state) { dout(1) << "handle_mds_map state change " @@ -1786,9 +1842,7 @@ void MDSRankDispatcher::handle_mds_map( // RESOLVE // is someone else newly resolving? - if (is_resolve() || is_reconnect() || is_rejoin() || - is_clientreplay() || is_active() || is_stopping()) { - + if (state >= MDSMap::STATE_RESOLVE) { // recover snaptable if (mdsmap->get_tableserver() == whoami) { if (oldstate < MDSMap::STATE_RESOLVE) { @@ -1799,16 +1853,17 @@ void MDSRankDispatcher::handle_mds_map( set old_set, new_set; oldmap->get_mds_set_lower_bound(old_set, MDSMap::STATE_RESOLVE); mdsmap->get_mds_set_lower_bound(new_set, MDSMap::STATE_RESOLVE); - for (auto p : new_set) { - if (p != whoami && // not me - old_set.count(p) == 0) { // newly so? - snapserver->handle_mds_recovery(p); + for (const auto& r : new_set) { + if (r == whoami) + continue; // not me + if (!old_set.count(r) || restart.count(r)) { // newly so? + snapserver->handle_mds_recovery(r); } } } } - if (!oldmap->is_resolving() && mdsmap->is_resolving()) { + if ((!oldmap->is_resolving() || !restart.empty()) && mdsmap->is_resolving()) { set resolve; mdsmap->get_mds_set(resolve, MDSMap::STATE_RESOLVE); dout(10) << " resolve set is " << resolve << dendl; @@ -1819,7 +1874,7 @@ void MDSRankDispatcher::handle_mds_map( // REJOIN // is everybody finally rejoining? - if (is_rejoin() || is_clientreplay() || is_active() || is_stopping()) { + if (state >= MDSMap::STATE_REJOIN) { // did we start? if (!oldmap->is_rejoining() && mdsmap->is_rejoining()) rejoin_joint_start(); @@ -1835,12 +1890,14 @@ void MDSRankDispatcher::handle_mds_map( set olddis, dis; oldmap->get_mds_set_lower_bound(olddis, MDSMap::STATE_REJOIN); mdsmap->get_mds_set_lower_bound(dis, MDSMap::STATE_REJOIN); - for (set::iterator p = dis.begin(); p != dis.end(); ++p) - if (*p != whoami && // not me - olddis.count(*p) == 0) { // newly so? 
- mdcache->kick_discovers(*p); - mdcache->kick_open_ino_peers(*p); + for (const auto& r : dis) { + if (r == whoami) + continue; // not me + if (!olddis.count(r) || restart.count(r)) { // newly so? + mdcache->kick_discovers(r); + mdcache->kick_open_ino_peers(r); } + } } } @@ -1855,42 +1912,16 @@ void MDSRankDispatcher::handle_mds_map( } // did someone go active? - if (oldstate >= MDSMap::STATE_CLIENTREPLAY && - (is_clientreplay() || is_active() || is_stopping())) { + if (state >= MDSMap::STATE_CLIENTREPLAY && + oldstate >= MDSMap::STATE_CLIENTREPLAY) { set oldactive, active; oldmap->get_mds_set_lower_bound(oldactive, MDSMap::STATE_CLIENTREPLAY); mdsmap->get_mds_set_lower_bound(active, MDSMap::STATE_CLIENTREPLAY); - for (set::iterator p = active.begin(); p != active.end(); ++p) - if (*p != whoami && // not me - oldactive.count(*p) == 0) // newly so? - handle_mds_recovery(*p); - } - - // did someone fail? - // new down? - { - set olddown, down; - oldmap->get_down_mds_set(&olddown); - mdsmap->get_down_mds_set(&down); - for (set::iterator p = down.begin(); p != down.end(); ++p) { - if (oldmap->have_inst(*p) && olddown.count(*p) == 0) { - messenger->mark_down_addrs(oldmap->get_addrs(*p)); - handle_mds_failure(*p); - } - } - } - - // did someone fail? - // did their addr/inst change? - { - set up; - mdsmap->get_up_mds_set(up); - for (set::iterator p = up.begin(); p != up.end(); ++p) { - if (oldmap->have_inst(*p) && - oldmap->get_addrs(*p) != mdsmap->get_addrs(*p)) { - messenger->mark_down_addrs(oldmap->get_addrs(*p)); - handle_mds_failure(*p); - } + for (const auto& r : active) { + if (r == whoami) + continue; // not me + if (!oldactive.count(r) || restart.count(r)) // newly so? 
+ handle_mds_recovery(r); } } @@ -1899,11 +1930,11 @@ void MDSRankDispatcher::handle_mds_map( set oldstopped, stopped; oldmap->get_stopped_mds_set(oldstopped); mdsmap->get_stopped_mds_set(stopped); - for (set::iterator p = stopped.begin(); p != stopped.end(); ++p) - if (oldstopped.count(*p) == 0) { // newly so? - mdcache->migrator->handle_mds_failure_or_stop(*p); + for (const auto& r : stopped) + if (oldstopped.count(r) == 0) { // newly so? + mdcache->migrator->handle_mds_failure_or_stop(r); if (mdsmap->get_tableserver() == whoami) - snapserver->handle_mds_failure_or_stop(*p); + snapserver->handle_mds_failure_or_stop(r); } }