From 299fcd95ae21f66de5a1e3c5d8529a7f49e7a0f1 Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Wed, 11 Jul 2018 11:01:43 +0800 Subject: [PATCH] mds: handle discontinuous mdsmap There are two cases in which an mds can get a discontinuous mdsmap: - the mdsmap was sent by another mds - the connection to the monitor was reset The monitor does not preserve old versions of the mdsmap. There is no easy way to ensure that an mds always gets a continuous mdsmap. Instead, making the mds handle a discontinuous mdsmap is not difficult. When failover happens in a multimds cluster, a survivor mds may miss the mdsmap that indicates the old mds failed and/or the mdsmap that indicates the new mds started to replay. But the survivor mds always gets the mdsmap that indicates the new mds entered the resolve state. Fixes: http://tracker.ceph.com/issues/24856 Signed-off-by: "Yan, Zheng" (cherry picked from commit 7de9da4a43914082eea0a1f9c8b98ce285e873ef) Conflicts: src/mds/MDSRank.cc --- src/mds/MDSMap.cc | 8 +-- src/mds/MDSRank.cc | 121 ++++++++++++++++++++++++++++----------------- 2 files changed, 79 insertions(+), 50 deletions(-) diff --git a/src/mds/MDSMap.cc b/src/mds/MDSMap.cc index 9dfce950f5a8d..f9d1c87ce2028 100644 --- a/src/mds/MDSMap.cc +++ b/src/mds/MDSMap.cc @@ -822,12 +822,12 @@ bool MDSMap::state_transition_valid(DaemonState prev, DaemonState next) state_valid = false; } } else if (prev == MDSMap::STATE_REJOIN) { - if (next != MDSMap::STATE_ACTIVE - && next != MDSMap::STATE_CLIENTREPLAY - && next != MDSMap::STATE_STOPPED) { + if (next != MDSMap::STATE_ACTIVE && + next != MDSMap::STATE_CLIENTREPLAY && + next != MDSMap::STATE_STOPPED) { state_valid = false; } - } else if (prev >= MDSMap::STATE_RECONNECT && prev < MDSMap::STATE_ACTIVE) { + } else if (prev >= MDSMap::STATE_RESOLVE && prev < MDSMap::STATE_ACTIVE) { // Once I have entered replay, the only allowable transitions are to // the next next along in the sequence.
if (next != prev + 1) { diff --git a/src/mds/MDSRank.cc b/src/mds/MDSRank.cc index 1da6f2db7d835..da67e93b93375 100644 --- a/src/mds/MDSRank.cc +++ b/src/mds/MDSRank.cc @@ -1650,6 +1650,60 @@ void MDSRankDispatcher::handle_mds_map( if (g_conf->mds_dump_cache_on_map) mdcache->dump_cache(); + // mdsmap and oldmap can be discontinuous. failover might happen in the missing mdsmap. + // the 'restart' set tracks ranks that have restarted since the old mdsmap + set restart; + // replaying mds does not communicate with other ranks + if (state >= MDSMap::STATE_RESOLVE) { + // did someone fail? + // new down? + set olddown, down; + oldmap->get_down_mds_set(&olddown); + mdsmap->get_down_mds_set(&down); + for (const auto& r : down) { + if (oldmap->have_inst(r) && olddown.count(r) == 0) { + messenger->mark_down(oldmap->get_inst(r).addr); + handle_mds_failure(r); + } + } + + // did someone fail? + // did their addr/inst change? + set up; + mdsmap->get_up_mds_set(up); + for (const auto& r : up) { + auto& info = mdsmap->get_info(r); + if (oldmap->have_inst(r)) { + auto& oldinfo = oldmap->get_info(r); + if (info.inc != oldinfo.inc) { + messenger->mark_down(oldinfo.addr); + if (info.state == MDSMap::STATE_REPLAY || + info.state == MDSMap::STATE_RESOLVE) { + restart.insert(r); + handle_mds_failure(r); + } else { + assert(info.state == MDSMap::STATE_STARTING || + info.state == MDSMap::STATE_ACTIVE); + // -> stopped (missing) -> starting -> active + restart.insert(r); + mdcache->migrator->handle_mds_failure_or_stop(r); + } + } + } else { + if (info.state == MDSMap::STATE_REPLAY || + info.state == MDSMap::STATE_RESOLVE) { + // -> starting/creating (missing) -> active (missing) -> replay -> resolve + restart.insert(r); + handle_mds_failure(r); + } else { + assert(info.state == MDSMap::STATE_CREATING || + info.state == MDSMap::STATE_STARTING || + info.state == MDSMap::STATE_ACTIVE); + } + } + } + } + // did it change? 
if (oldstate != state) { dout(1) << "handle_mds_map state change " @@ -1693,9 +1747,8 @@ void MDSRankDispatcher::handle_mds_map( // RESOLVE // is someone else newly resolving? - if (is_resolve() || is_reconnect() || is_rejoin() || - is_clientreplay() || is_active() || is_stopping()) { - if (!oldmap->is_resolving() && mdsmap->is_resolving()) { + if (state >= MDSMap::STATE_RESOLVE) { + if ((!oldmap->is_resolving() || !restart.empty()) && mdsmap->is_resolving()) { set resolve; mdsmap->get_mds_set(resolve, MDSMap::STATE_RESOLVE); dout(10) << " resolve set is " << resolve << dendl; @@ -1706,7 +1759,7 @@ void MDSRankDispatcher::handle_mds_map( // REJOIN // is everybody finally rejoining? - if (is_rejoin() || is_clientreplay() || is_active() || is_stopping()) { + if (state >= MDSMap::STATE_REJOIN) { // did we start? if (!oldmap->is_rejoining() && mdsmap->is_rejoining()) rejoin_joint_start(); @@ -1722,12 +1775,14 @@ void MDSRankDispatcher::handle_mds_map( set olddis, dis; oldmap->get_mds_set_lower_bound(olddis, MDSMap::STATE_REJOIN); mdsmap->get_mds_set_lower_bound(dis, MDSMap::STATE_REJOIN); - for (set::iterator p = dis.begin(); p != dis.end(); ++p) - if (*p != whoami && // not me - olddis.count(*p) == 0) { // newly so? - mdcache->kick_discovers(*p); - mdcache->kick_open_ino_peers(*p); + for (const auto& r : dis) { + if (r == whoami) + continue; // not me + if (!olddis.count(r) || restart.count(r)) { // newly so? + mdcache->kick_discovers(r); + mdcache->kick_open_ino_peers(r); } + } } } @@ -1742,53 +1797,27 @@ void MDSRankDispatcher::handle_mds_map( } // did someone go active? 
- if (oldstate >= MDSMap::STATE_CLIENTREPLAY && - (is_clientreplay() || is_active() || is_stopping())) { + if (state >= MDSMap::STATE_CLIENTREPLAY && + oldstate >= MDSMap::STATE_CLIENTREPLAY) { set oldactive, active; oldmap->get_mds_set_lower_bound(oldactive, MDSMap::STATE_CLIENTREPLAY); mdsmap->get_mds_set_lower_bound(active, MDSMap::STATE_CLIENTREPLAY); - for (set::iterator p = active.begin(); p != active.end(); ++p) - if (*p != whoami && // not me - oldactive.count(*p) == 0) // newly so? - handle_mds_recovery(*p); - } - - // did someone fail? - // new down? - { - set olddown, down; - oldmap->get_down_mds_set(&olddown); - mdsmap->get_down_mds_set(&down); - for (set::iterator p = down.begin(); p != down.end(); ++p) { - if (oldmap->have_inst(*p) && olddown.count(*p) == 0) { - messenger->mark_down(oldmap->get_inst(*p).addr); - handle_mds_failure(*p); - } + for (const auto& r : active) { + if (r == whoami) + continue; // not me + if (!oldactive.count(r) || restart.count(r)) // newly so? + handle_mds_recovery(r); } } - // did someone fail? - // did their addr/inst change? - { - set up; - mdsmap->get_up_mds_set(up); - for (set::iterator p = up.begin(); p != up.end(); ++p) { - if (oldmap->have_inst(*p) && - oldmap->get_inst(*p) != mdsmap->get_inst(*p)) { - messenger->mark_down(oldmap->get_inst(*p).addr); - handle_mds_failure(*p); - } - } - } - - if (is_clientreplay() || is_active() || is_stopping()) { + if (state >= MDSMap::STATE_CLIENTREPLAY) { // did anyone stop? set oldstopped, stopped; oldmap->get_stopped_mds_set(oldstopped); mdsmap->get_stopped_mds_set(stopped); - for (set::iterator p = stopped.begin(); p != stopped.end(); ++p) - if (oldstopped.count(*p) == 0) // newly so? - mdcache->migrator->handle_mds_failure_or_stop(*p); + for (const auto& r : stopped) + if (oldstopped.count(r) == 0) // newly so? + mdcache->migrator->handle_mds_failure_or_stop(r); } { -- 2.39.5