From 7de9da4a43914082eea0a1f9c8b98ce285e873ef Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Wed, 11 Jul 2018 11:01:43 +0800 Subject: [PATCH] mds: handle discontinuous mdsmap There are two cases in which an mds can get a discontinuous mdsmap: - the mdsmap was sent by another mds - the connection to the monitor was reset The monitor does not preserve old versions of the mdsmap, so there is no easy way to ensure that an mds always gets a continuous mdsmap. Instead, making the mds handle a discontinuous mdsmap is not difficult. When failover happens in a multi-mds cluster, a surviving mds may miss the mdsmap that indicates the old mds failed and/or the mdsmap that indicates the new mds started to replay. But the surviving mds always gets the mdsmap that indicates the new mds entered the resolve state. Fixes: http://tracker.ceph.com/issues/24856 Signed-off-by: "Yan, Zheng" --- src/mds/MDSMap.cc | 8 +-- src/mds/MDSRank.cc | 133 ++++++++++++++++++++++++++++----------------- 2 files changed, 86 insertions(+), 55 deletions(-) diff --git a/src/mds/MDSMap.cc b/src/mds/MDSMap.cc index 9f985da69530f..446b59cb61651 100644 --- a/src/mds/MDSMap.cc +++ b/src/mds/MDSMap.cc @@ -898,12 +898,12 @@ bool MDSMap::state_transition_valid(DaemonState prev, DaemonState next) state_valid = false; } } else if (prev == MDSMap::STATE_REJOIN) { - if (next != MDSMap::STATE_ACTIVE - && next != MDSMap::STATE_CLIENTREPLAY - && next != MDSMap::STATE_STOPPED) { + if (next != MDSMap::STATE_ACTIVE && + next != MDSMap::STATE_CLIENTREPLAY && + next != MDSMap::STATE_STOPPED) { state_valid = false; } - } else if (prev >= MDSMap::STATE_RECONNECT && prev < MDSMap::STATE_ACTIVE) { + } else if (prev >= MDSMap::STATE_RESOLVE && prev < MDSMap::STATE_ACTIVE) { // Once I have entered replay, the only allowable transitions are to // the next next along in the sequence. 
if (next != prev + 1) { diff --git a/src/mds/MDSRank.cc b/src/mds/MDSRank.cc index 9d7b77dc2134e..ffb3b174ce83f 100644 --- a/src/mds/MDSRank.cc +++ b/src/mds/MDSRank.cc @@ -1743,6 +1743,62 @@ void MDSRankDispatcher::handle_mds_map( if (g_conf()->mds_dump_cache_on_map) mdcache->dump_cache(); + // mdsmap and oldmap can be discontinuous. failover might happen in the missing mdsmap. + // the 'restart' set tracks ranks that have restarted since the old mdsmap + set restart; + // replaying mds does not communicate with other ranks + if (state >= MDSMap::STATE_RESOLVE) { + // did someone fail? + // new down? + set olddown, down; + oldmap->get_down_mds_set(&olddown); + mdsmap->get_down_mds_set(&down); + for (const auto& r : down) { + if (oldmap->have_inst(r) && olddown.count(r) == 0) { + messenger->mark_down_addrs(oldmap->get_addrs(r)); + handle_mds_failure(r); + } + } + + // did someone fail? + // did their addr/inst change? + set up; + mdsmap->get_up_mds_set(up); + for (const auto& r : up) { + auto& info = mdsmap->get_info(r); + if (oldmap->have_inst(r)) { + auto& oldinfo = oldmap->get_info(r); + if (info.inc != oldinfo.inc) { + messenger->mark_down_addrs(oldinfo.get_addrs()); + if (info.state == MDSMap::STATE_REPLAY || + info.state == MDSMap::STATE_RESOLVE) { + restart.insert(r); + handle_mds_failure(r); + } else { + assert(info.state == MDSMap::STATE_STARTING || + info.state == MDSMap::STATE_ACTIVE); + // -> stopped (missing) -> starting -> active + restart.insert(r); + mdcache->migrator->handle_mds_failure_or_stop(r); + if (mdsmap->get_tableserver() == whoami) + snapserver->handle_mds_failure_or_stop(r); + } + } + } else { + if (info.state == MDSMap::STATE_REPLAY || + info.state == MDSMap::STATE_RESOLVE) { + // -> starting/creating (missing) -> active (missing) -> replay -> resolve + restart.insert(r); + handle_mds_failure(r); + } else { + assert(info.state == MDSMap::STATE_CREATING || + info.state == MDSMap::STATE_STARTING || + info.state == MDSMap::STATE_ACTIVE); + 
} + } + } + } + // did it change? if (oldstate != state) { dout(1) << "handle_mds_map state change " @@ -1786,9 +1842,7 @@ void MDSRankDispatcher::handle_mds_map( // RESOLVE // is someone else newly resolving? - if (is_resolve() || is_reconnect() || is_rejoin() || - is_clientreplay() || is_active() || is_stopping()) { - + if (state >= MDSMap::STATE_RESOLVE) { // recover snaptable if (mdsmap->get_tableserver() == whoami) { if (oldstate < MDSMap::STATE_RESOLVE) { @@ -1799,16 +1853,17 @@ void MDSRankDispatcher::handle_mds_map( set old_set, new_set; oldmap->get_mds_set_lower_bound(old_set, MDSMap::STATE_RESOLVE); mdsmap->get_mds_set_lower_bound(new_set, MDSMap::STATE_RESOLVE); - for (auto p : new_set) { - if (p != whoami && // not me - old_set.count(p) == 0) { // newly so? - snapserver->handle_mds_recovery(p); + for (const auto& r : new_set) { + if (r == whoami) + continue; // not me + if (!old_set.count(r) || restart.count(r)) { // newly so? + snapserver->handle_mds_recovery(r); } } } } - if (!oldmap->is_resolving() && mdsmap->is_resolving()) { + if ((!oldmap->is_resolving() || !restart.empty()) && mdsmap->is_resolving()) { set resolve; mdsmap->get_mds_set(resolve, MDSMap::STATE_RESOLVE); dout(10) << " resolve set is " << resolve << dendl; @@ -1819,7 +1874,7 @@ void MDSRankDispatcher::handle_mds_map( // REJOIN // is everybody finally rejoining? - if (is_rejoin() || is_clientreplay() || is_active() || is_stopping()) { + if (state >= MDSMap::STATE_REJOIN) { // did we start? if (!oldmap->is_rejoining() && mdsmap->is_rejoining()) rejoin_joint_start(); @@ -1835,12 +1890,14 @@ void MDSRankDispatcher::handle_mds_map( set olddis, dis; oldmap->get_mds_set_lower_bound(olddis, MDSMap::STATE_REJOIN); mdsmap->get_mds_set_lower_bound(dis, MDSMap::STATE_REJOIN); - for (set::iterator p = dis.begin(); p != dis.end(); ++p) - if (*p != whoami && // not me - olddis.count(*p) == 0) { // newly so? 
- mdcache->kick_discovers(*p); - mdcache->kick_open_ino_peers(*p); + for (const auto& r : dis) { + if (r == whoami) + continue; // not me + if (!olddis.count(r) || restart.count(r)) { // newly so? + mdcache->kick_discovers(r); + mdcache->kick_open_ino_peers(r); } + } } } @@ -1855,42 +1912,16 @@ void MDSRankDispatcher::handle_mds_map( } // did someone go active? - if (oldstate >= MDSMap::STATE_CLIENTREPLAY && - (is_clientreplay() || is_active() || is_stopping())) { + if (state >= MDSMap::STATE_CLIENTREPLAY && + oldstate >= MDSMap::STATE_CLIENTREPLAY) { set oldactive, active; oldmap->get_mds_set_lower_bound(oldactive, MDSMap::STATE_CLIENTREPLAY); mdsmap->get_mds_set_lower_bound(active, MDSMap::STATE_CLIENTREPLAY); - for (set::iterator p = active.begin(); p != active.end(); ++p) - if (*p != whoami && // not me - oldactive.count(*p) == 0) // newly so? - handle_mds_recovery(*p); - } - - // did someone fail? - // new down? - { - set olddown, down; - oldmap->get_down_mds_set(&olddown); - mdsmap->get_down_mds_set(&down); - for (set::iterator p = down.begin(); p != down.end(); ++p) { - if (oldmap->have_inst(*p) && olddown.count(*p) == 0) { - messenger->mark_down_addrs(oldmap->get_addrs(*p)); - handle_mds_failure(*p); - } - } - } - - // did someone fail? - // did their addr/inst change? - { - set up; - mdsmap->get_up_mds_set(up); - for (set::iterator p = up.begin(); p != up.end(); ++p) { - if (oldmap->have_inst(*p) && - oldmap->get_addrs(*p) != mdsmap->get_addrs(*p)) { - messenger->mark_down_addrs(oldmap->get_addrs(*p)); - handle_mds_failure(*p); - } + for (const auto& r : active) { + if (r == whoami) + continue; // not me + if (!oldactive.count(r) || restart.count(r)) // newly so? 
+ handle_mds_recovery(r); } } @@ -1899,11 +1930,11 @@ void MDSRankDispatcher::handle_mds_map( set oldstopped, stopped; oldmap->get_stopped_mds_set(oldstopped); mdsmap->get_stopped_mds_set(stopped); - for (set::iterator p = stopped.begin(); p != stopped.end(); ++p) - if (oldstopped.count(*p) == 0) { // newly so? - mdcache->migrator->handle_mds_failure_or_stop(*p); + for (const auto& r : stopped) + if (oldstopped.count(r) == 0) { // newly so? + mdcache->migrator->handle_mds_failure_or_stop(r); if (mdsmap->get_tableserver() == whoami) - snapserver->handle_mds_failure_or_stop(*p); + snapserver->handle_mds_failure_or_stop(r); } } -- 2.39.5