From e296685e8f3f5158238216eefb76482bd6d55134 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Thu, 18 Sep 2014 14:23:36 -0700 Subject: [PATCH] mon: re-bootstrap if we get probed by a mon that is way ahead During bootstrap we verify that our paxos commits overlap with the other mons we will form a quorum with. If they do not, we do a sync. However, it is possible we pass those checks, then fail to join a quorum before the quorum moves ahead in time such that we no longer overlap. Currently nothing kicks us back into a probing state to discover we need to sync... we will just keep trying to call or join an election instead. Fix this by jumping back to bootstrap if we get a probe that is ahead of us. Only do this from non probe or sync states as these will be common; it is only the active and electing states that matter (and probably just electing!). Fixes: #9301 Backport: giant, firefly Signed-off-by: Sage Weil (cherry picked from commit c421b55e8e15ef04ca8aeb47f7d090375eaa8573) --- src/mon/Monitor.cc | 25 +++++++++++++++++++++---- 1 file changed, 21 insertions(+), 4 deletions(-) diff --git a/src/mon/Monitor.cc b/src/mon/Monitor.cc index d799d7d819e14..ad35e5ed39c08 100644 --- a/src/mon/Monitor.cc +++ b/src/mon/Monitor.cc @@ -1408,6 +1408,8 @@ void Monitor::handle_probe(MMonProbe *m) */ void Monitor::handle_probe_probe(MMonProbe *m) { + MMonProbe *r; + dout(10) << "handle_probe_probe " << m->get_source_inst() << *m << " features " << m->get_connection()->get_features() << dendl; uint64_t missing = required_features & ~m->get_connection()->get_features(); @@ -1420,12 +1422,26 @@ void Monitor::handle_probe_probe(MMonProbe *m) m->required_features = required_features; messenger->send_message(r, m->get_connection()); } - m->put(); - return; + goto out; } - MMonProbe *r = new MMonProbe(monmap->fsid, MMonProbe::OP_REPLY, - name, has_ever_joined); + if (!is_probing() && !is_synchronizing()) { + // If the probing mon is way ahead of us, we need to re-bootstrap. 
+ // Normally we capture this case when we initially bootstrap, but + // it is possible we pass those checks (we overlap with + // quorum-to-be) but fail to join a quorum before it moves past + // us. We need to be kicked back to bootstrap so we can + // synchronize, not keep calling elections. + if (paxos->get_version() + 1 < m->paxos_first_version) { + dout(1) << " peer " << m->get_source_addr() << " has first_committed " + << "ahead of us, re-bootstrapping" << dendl; + bootstrap(); + goto out; + + } + } + + r = new MMonProbe(monmap->fsid, MMonProbe::OP_REPLY, name, has_ever_joined); r->name = name; r->quorum = quorum; monmap->encode(r->monmap_bl, m->get_connection()->get_features()); @@ -1440,6 +1456,7 @@ void Monitor::handle_probe_probe(MMonProbe *m) extra_probe_peers.insert(m->get_source_addr()); } + out: m->put(); } -- 2.39.5