git.apps.os.sepia.ceph.com Git - ceph.git/commitdiff
Revert "mon: when electing, be sure acked leaders have new enough stores to lead"
author: Greg Farnum <greg@inktank.com>
Tue, 30 Apr 2013 19:02:20 +0000 (12:02 -0700)
committer: Greg Farnum <greg@inktank.com>
Tue, 30 Apr 2013 20:50:40 +0000 (13:50 -0700)
The reverted change was somehow broken -- out-of-date leaders were being
elected -- and we've decided smaller band-aids are more appropriate. We don't
completely revert the MMonElection changes, though -- there have been user
clusters running the code which includes these message fields, so we can't
pretend it never happened. We can make them clearly unused in the code, though.

This reverts commit fcaabf1a22723c571c10d402464071c6405607c0.

Signed-off-by: Greg Farnum <greg@inktank.com>
src/messages/MMonElection.h
src/mon/Elector.cc
src/mon/Elector.h

index 9771f6123d607245e6c4345404b6bbcd3495cb71..3d7dd4ec90ef12e6858d554a5c6538dc7c54a83c 100644 (file)
@@ -45,19 +45,20 @@ public:
   bufferlist monmap_bl;
   set<int> quorum;
   uint64_t quorum_features;
-  version_t paxos_first_version;
-  version_t paxos_last_version;
+  /* the following were both used in the next branch for a while
+   * on user cluster, so we've left them in for compatibility. */
+  version_t defunct_one;
+  version_t defunct_two;
   
   MMonElection() : Message(MSG_MON_ELECTION, HEAD_VERSION, COMPAT_VERSION),
-    op(0), epoch(0), quorum_features(0), paxos_first_version(0),
-    paxos_last_version(0)
+    op(0), epoch(0), quorum_features(0), defunct_one(0),
+    defunct_two(0)
   { }
 
-  MMonElection(int o, epoch_t e, MonMap *m,
-               version_t paxos_first, version_t paxos_last)
+  MMonElection(int o, epoch_t e, MonMap *m)
     : Message(MSG_MON_ELECTION, HEAD_VERSION, COMPAT_VERSION),
       fsid(m->fsid), op(o), epoch(e), quorum_features(0),
-      paxos_first_version(paxos_first), paxos_last_version(paxos_last)
+      defunct_one(0), defunct_two(0)
   {
     // encode using full feature set; we will reencode for dest later,
     // if necessary
@@ -87,8 +88,8 @@ public:
     ::encode(monmap_bl, payload);
     ::encode(quorum, payload);
     ::encode(quorum_features, payload);
-    ::encode(paxos_first_version, payload);
-    ::encode(paxos_last_version, payload);
+    ::encode(defunct_one, payload);
+    ::encode(defunct_two, payload);
   }
   void decode_payload() {
     bufferlist::iterator p = payload.begin();
@@ -105,8 +106,8 @@ public:
     else
       quorum_features = 0;
     if (header.version >= 4) {
-      ::decode(paxos_first_version, p);
-      ::decode(paxos_last_version, p);
+      ::decode(defunct_one, p);
+      ::decode(defunct_two, p);
     }
   }
   
index b6f047e20d2546ea140bbcd7f06bb5677e63b9a3..32d78b4eb4b8007f37a7941f492825586e624ce9 100644 (file)
@@ -81,21 +81,18 @@ void Elector::start()
   electing_me = true;
   acked_me[mon->rank] = CEPH_FEATURES_ALL;
   leader_acked = -1;
-  acked_first_paxos_version = mon->paxos->get_first_committed();
 
   // bcast to everyone else
   for (unsigned i=0; i<mon->monmap->size(); ++i) {
     if ((int)i == mon->rank) continue;
-    Message *m = new MMonElection(MMonElection::OP_PROPOSE, epoch, mon->monmap,
-                                  mon->paxos->get_first_committed(),
-                                  mon->paxos->get_version());
+    Message *m = new MMonElection(MMonElection::OP_PROPOSE, epoch, mon->monmap);
     mon->messenger->send_message(m, mon->monmap->get_inst(i));
   }
   
   reset_timer();
 }
 
-void Elector::defer(int who, version_t paxos_first)
+void Elector::defer(int who)
 {
   dout(5) << "defer to " << who << dendl;
 
@@ -107,11 +104,8 @@ void Elector::defer(int who, version_t paxos_first)
 
   // ack them
   leader_acked = who;
-  acked_first_paxos_version = paxos_first;
   ack_stamp = ceph_clock_now(g_ceph_context);
-  mon->messenger->send_message(new MMonElection(MMonElection::OP_ACK, epoch, mon->monmap,
-                                                mon->paxos->get_first_committed(),
-                                                mon->paxos->get_version()),
+  mon->messenger->send_message(new MMonElection(MMonElection::OP_ACK, epoch, mon->monmap),
                               mon->monmap->get_inst(who));
   
   // set a timer
@@ -175,10 +169,7 @@ void Elector::victory()
        p != quorum.end();
        ++p) {
     if (*p == mon->rank) continue;
-    MMonElection *m = new MMonElection(MMonElection::OP_VICTORY, epoch,
-                                       mon->monmap,
-                                       mon->paxos->get_first_committed(),
-                                       mon->paxos->get_version());
+    MMonElection *m = new MMonElection(MMonElection::OP_VICTORY, epoch, mon->monmap);
     m->quorum = quorum;
     mon->messenger->send_message(m, mon->monmap->get_inst(*p));
   }
@@ -214,13 +205,10 @@ void Elector::handle_propose(MMonElection *m)
     }
   }
 
-  if ((mon->rank < from) &&
-      // be careful that we have new enough data to be leader!
-      (m->paxos_first_version <= mon->paxos->get_version())) {
+  if (mon->rank < from) {
     // i would win over them.
     if (leader_acked >= 0) {        // we already acked someone
-      assert((leader_acked < from) || // and they still win, of course
-             (acked_first_paxos_version > mon->paxos->get_version()));
+      assert(leader_acked < from);  // and they still win, of course
       dout(5) << "no, we already acked " << leader_acked << dendl;
     } else {
       // wait, i should win!
@@ -229,20 +217,16 @@ void Elector::handle_propose(MMonElection *m)
        mon->start_election();
       }
     }
-  } else if (m->paxos_last_version >= mon->paxos->get_first_committed()) {
+  } else {
     // they would win over me
     if (leader_acked < 0 ||      // haven't acked anyone yet, or
        leader_acked > from ||   // they would win over who you did ack, or
-       leader_acked == from) { // this is the guy we're already deferring to
-      defer(from, m->paxos_first_version);
+       leader_acked == from) {  // this is the guy we're already deferring to
+      defer(from);
     } else {
       // ignore them!
       dout(5) << "no, we already acked " << leader_acked << dendl;
     }
-  } else { // they are too out-of-date
-    dout(5) << "no, they are too far behind; paxos version: "
-           << m->paxos_last_version << " versus my first "
-           << mon->paxos->get_first_committed() << dendl;
   }
   
   m->put();
index 9cce81e9f49909fb559961edd85322b9001fb0bc..d81eb2397633589efcded86033bb2706480e7e2b 100644 (file)
@@ -125,10 +125,6 @@ class Elector {
    * Indicates who we have acked
    */
   int      leader_acked;
-  /**
-   * Indicates the first_paxos_commit on who we've acked
-   */
-  version_t acked_first_paxos_version;
   /**
    * Indicates when we have acked him
    */
@@ -201,17 +197,16 @@ class Elector {
    * to become the Leader. We will only defer an election if the monitor we
    * are deferring to outranks us.
    *
-   * @pre   @p who outranks us (who < our rank, or we're behind their store)
+   * @pre   @p who outranks us (i.e., who < our rank)
    * @pre   @p who outranks any other monitor we have deferred to in the past
    * @post  electing_me is false
    * @post  leader_acked equals @p who
    * @post  we sent an ack message to @p who
    * @post  we reset the expire_event timer
    *
-   * @param who Some other monitor's numeric identifier.
-   * @param paxos_first The other monitor's first committed paxos version
+   * @param who Some other monitor's numeric identifier. 
    */
-  void defer(int who, version_t paxos_first);
+  void defer(int who);
   /**
    * The election has taken too long and has expired.
    *
@@ -331,8 +326,7 @@ class Elector {
                               epoch(0),
                               participating(true),
                               electing_me(false),
-                              leader_acked(-1),
-                              acked_first_paxos_version(0) { }
+                              leader_acked(-1) { }
 
   /**
    * Initiate the Elector class.