From fcaabf1a22723c571c10d402464071c6405607c0 Mon Sep 17 00:00:00 2001 From: Greg Farnum Date: Wed, 24 Apr 2013 15:36:41 -0700 Subject: [PATCH] mon: when electing, be sure acked leaders have new enough stores to lead In general anybody participating in an election should be new enough to lead thanks to the bootstrap process, but we've observed situations in which a monitor is leader but gets so busy that it gets booted out without noticing for a while, then processes the election messages which were spawned, responds to them, and the other monitors kick those up to a new election epoch. Then the old and behind monitor gets elected as the new leader, which does bad things to our sync. To deal with this, add the paxos first and last committed versions to the MMonElection messages, and consider those values when deciding whether to defer to a peer. Only defer to them if their newest value is newer than our oldest, but also *do* defer to them if their oldest value is newer than our newest even if we out-rank them otherwise. 
Signed-off-by: Greg Farnum --- src/messages/MMonElection.h | 23 +++++++++++++++++++---- src/mon/Elector.cc | 34 +++++++++++++++++++++++++--------- src/mon/Elector.h | 14 ++++++++++---- 3 files changed, 54 insertions(+), 17 deletions(-) diff --git a/src/messages/MMonElection.h b/src/messages/MMonElection.h index b589d9842f1c..9771f6123d60 100644 --- a/src/messages/MMonElection.h +++ b/src/messages/MMonElection.h @@ -21,7 +21,7 @@ class MMonElection : public Message { - static const int HEAD_VERSION = 3; + static const int HEAD_VERSION = 4; static const int COMPAT_VERSION = 2; public: @@ -45,11 +45,20 @@ public: bufferlist monmap_bl; set<int> quorum; uint64_t quorum_features; + version_t paxos_first_version; + version_t paxos_last_version; - MMonElection() : Message(MSG_MON_ELECTION, HEAD_VERSION, COMPAT_VERSION) { } - MMonElection(int o, epoch_t e, MonMap *m) + MMonElection() : Message(MSG_MON_ELECTION, HEAD_VERSION, COMPAT_VERSION), + op(0), epoch(0), quorum_features(0), paxos_first_version(0), + paxos_last_version(0) + { } + + MMonElection(int o, epoch_t e, MonMap *m, + version_t paxos_first, version_t paxos_last) : Message(MSG_MON_ELECTION, HEAD_VERSION, COMPAT_VERSION), - fsid(m->fsid), op(o), epoch(e), quorum_features(0) { + fsid(m->fsid), op(o), epoch(e), quorum_features(0), + paxos_first_version(paxos_first), paxos_last_version(paxos_last) + { // encode using full feature set; we will reencode for dest later, // if necessary m->encode(monmap_bl, CEPH_FEATURES_ALL); @@ -78,6 +87,8 @@ public: ::encode(monmap_bl, payload); ::encode(quorum, payload); ::encode(quorum_features, payload); + ::encode(paxos_first_version, payload); + ::encode(paxos_last_version, payload); } void decode_payload() { bufferlist::iterator p = payload.begin(); @@ -93,6 +104,10 @@ public: ::decode(quorum_features, p); else quorum_features = 0; + if (header.version >= 4) { + ::decode(paxos_first_version, p); + ::decode(paxos_last_version, p); + } } }; diff --git a/src/mon/Elector.cc 
b/src/mon/Elector.cc index 2b5bcc2ae88a..122fe48846b3 100644 --- a/src/mon/Elector.cc +++ b/src/mon/Elector.cc @@ -80,18 +80,21 @@ void Elector::start() electing_me = true; acked_me[mon->rank] = CEPH_FEATURES_ALL; leader_acked = -1; + acked_first_paxos_version = mon->paxos->get_first_committed(); // bcast to everyone else for (unsigned i=0; i<mon->monmap->size(); ++i) { if ((int)i == mon->rank) continue; - Message *m = new MMonElection(MMonElection::OP_PROPOSE, epoch, mon->monmap); + Message *m = new MMonElection(MMonElection::OP_PROPOSE, epoch, mon->monmap, + mon->paxos->get_first_committed(), + mon->paxos->get_version()); mon->messenger->send_message(m, mon->monmap->get_inst(i)); } reset_timer(); } -void Elector::defer(int who) +void Elector::defer(int who, version_t paxos_first) { dout(5) << "defer to " << who << dendl; @@ -103,8 +106,11 @@ void Elector::defer(int who) // ack them leader_acked = who; + acked_first_paxos_version = paxos_first; ack_stamp = ceph_clock_now(g_ceph_context); - mon->messenger->send_message(new MMonElection(MMonElection::OP_ACK, epoch, mon->monmap), + mon->messenger->send_message(new MMonElection(MMonElection::OP_ACK, epoch, mon->monmap, + mon->paxos->get_first_committed(), + mon->paxos->get_version()), mon->monmap->get_inst(who)); // set a timer @@ -168,7 +174,10 @@ void Elector::victory() p != quorum.end(); ++p) { if (*p == mon->rank) continue; - MMonElection *m = new MMonElection(MMonElection::OP_VICTORY, epoch, mon->monmap); + MMonElection *m = new MMonElection(MMonElection::OP_VICTORY, epoch, + mon->monmap, + mon->paxos->get_first_committed(), + mon->paxos->get_version()); m->quorum = quorum; mon->messenger->send_message(m, mon->monmap->get_inst(*p)); } @@ -204,10 +213,13 @@ void Elector::handle_propose(MMonElection *m) } } - if (mon->rank < from) { + if ((mon->rank < from) && + // be careful that we have new enough data to be leader! + (m->paxos_first_version <= mon->paxos->get_version())) { // i would win over them. 
if (leader_acked >= 0) { // we already acked someone - assert(leader_acked < from); // and they still win, of course + assert((leader_acked < from) || // and they still win, of course + (acked_first_paxos_version > mon->paxos->get_version())); dout(5) << "no, we already acked " << leader_acked << dendl; } else { // wait, i should win! @@ -216,16 +228,20 @@ void Elector::handle_propose(MMonElection *m) mon->start_election(); } } - } else { + } else if (m->paxos_last_version >= mon->paxos->get_first_committed()) { // they would win over me if (leader_acked < 0 || // haven't acked anyone yet, or leader_acked > from || // they would win over who you did ack, or - leader_acked == from) { // this is the guy we're already deferring to - defer(from); + leader_acked == from) { // this is the guy we're already deferring to + defer(from, m->paxos_first_version); } else { // ignore them! dout(5) << "no, we already acked " << leader_acked << dendl; } + } else { // they are too out-of-date + dout(5) << "no, they are too far behind; paxos version: " + << m->paxos_last_version << " versus my first " + << mon->paxos->get_first_committed() << dendl; } m->put(); diff --git a/src/mon/Elector.h b/src/mon/Elector.h index d81eb2397633..9cce81e9f499 100644 --- a/src/mon/Elector.h +++ b/src/mon/Elector.h @@ -125,6 +125,10 @@ class Elector { * Indicates who we have acked */ int leader_acked; + /** + * Indicates the first_paxos_commit on who we've acked + */ + version_t acked_first_paxos_version; /** * Indicates when we have acked him */ @@ -197,16 +201,17 @@ class Elector { * to become the Leader. We will only defer an election if the monitor we * are deferring to outranks us. 
* - * @pre @p who outranks us (i.e., who < our rank) + * @pre @p who outranks us (who < our rank, or we're behind their store) * @pre @p who outranks any other monitor we have deferred to in the past * @post electing_me is false * @post leader_acked equals @p who * @post we sent an ack message to @p who * @post we reset the expire_event timer * - * @param who Some other monitor's numeric identifier. + * @param who Some other monitor's numeric identifier. + * @param paxos_first The other monitor's first committed paxos version */ - void defer(int who); + void defer(int who, version_t paxos_first); /** * The election has taken too long and has expired. * @@ -326,7 +331,8 @@ class Elector { epoch(0), participating(true), electing_me(false), - leader_acked(-1) { } + leader_acked(-1), + acked_first_paxos_version(0) { } /** * Initiate the Elector class. -- 2.47.3