git.apps.os.sepia.ceph.com Git - ceph.git/commitdiff
mon: allow all paxos instances to recover before going writeable
author: Sage Weil <sage@inktank.com>
Thu, 6 Sep 2012 20:31:51 +0000 (13:31 -0700)
committer: Sage Weil <sage@inktank.com>
Thu, 13 Sep 2012 00:33:00 +0000 (17:33 -0700)
Wait for all paxos instances to settle (recover, and commit any proposed
but uncommitted value) before any of them go writeable.  This will allow
the leader to choose global versions responsibly.

Signed-off-by: Sage Weil <sage@inktank.com>
src/mon/Monitor.cc
src/mon/Monitor.h
src/mon/Paxos.cc

index e51e85231e12ab0006ffc8826421ff32bcdfae77..b0ee90ffc7de97aacbad8add9c69021ed515c46c 100644 (file)
@@ -215,6 +215,16 @@ Monitor::~Monitor()
   delete mon_caps;
 }
 
+void Monitor::recovered_machine(int id)
+{
+  paxos_recovered.insert(id);
+  if (paxos_recovered.size() == paxos.size()) {
+    dout(10) << "all paxos instances recovered, going writeable" << dendl;
+    for (vector<Paxos*>::iterator p = paxos.begin(); p != paxos.end(); p++)
+      finish_contexts(g_ceph_context, (*p)->waiting_for_writeable);
+  }
+}
+
 enum {
   l_mon_first = 456000,
   l_mon_last,
@@ -996,6 +1006,8 @@ void Monitor::win_election(epoch_t epoch, set<int>& active, unsigned features)
           << " features are " << quorum_features
           << dendl;
 
+  paxos_recovered.clear();
+
   clog.info() << "mon." << name << "@" << rank
                << " won leader election with quorum " << quorum << "\n";
   
index 9e92de6be7f06b51a417f9b2d923bf0ca60ef709..4983f7570236724f76329b72183601d848076cf0 100644 (file)
@@ -187,6 +187,15 @@ private:
   list<Context*> waitfor_quorum;
   list<Context*> maybe_wait_for_quorum;
 
+  // multi-paxos global version sequencing kludge-o-rama
+  set<int> paxos_recovered;     ///< num paxos machines fully recovered during this election epoch
+public:
+  void recovered_machine(int id);
+  bool is_all_paxos_recovered() {
+    return paxos_recovered.size() == paxos.size();
+  }
+
+private:
   Context *probe_timeout_event;  // for probing and slurping states
 
   struct C_ProbeTimeout : public Context {
index b600716d9488092827d622f2f1ae9bf9487d23cf..53f6835b937b01d921016324c6260b1c81e2b5a2 100644 (file)
@@ -335,7 +335,9 @@ void Paxos::handle_last(MMonPaxos *last)
        // wake people up
        finish_contexts(g_ceph_context, waiting_for_active);
        finish_contexts(g_ceph_context, waiting_for_readable);
-       finish_contexts(g_ceph_context, waiting_for_writeable);
+       //finish_contexts(g_ceph_context, waiting_for_writeable);
+
+       mon->recovered_machine(machine_id);
       }
     }
   } else {
@@ -383,10 +385,14 @@ void Paxos::begin(bufferlist& v)
     // we're alone, take it easy
     commit();
     state = STATE_ACTIVE;
+
     finish_contexts(g_ceph_context, waiting_for_active);
     finish_contexts(g_ceph_context, waiting_for_commit);
     finish_contexts(g_ceph_context, waiting_for_readable);
-    finish_contexts(g_ceph_context, waiting_for_writeable);
+    //finish_contexts(g_ceph_context, waiting_for_writeable);
+
+    mon->recovered_machine(machine_id);
+
     return;
   }
 
@@ -488,13 +494,15 @@ void Paxos::handle_accept(MMonPaxos *accept)
     // yay!
     state = STATE_ACTIVE;
     extend_lease();
-  
+
     // wake people up
     finish_contexts(g_ceph_context, waiting_for_active);
     finish_contexts(g_ceph_context, waiting_for_commit);
     finish_contexts(g_ceph_context, waiting_for_readable);
-    finish_contexts(g_ceph_context, waiting_for_writeable);
-  }
+    //finish_contexts(g_ceph_context, waiting_for_writeable);
+
+    mon->recovered_machine(machine_id);
+    }
   accept->put();
 }
 
@@ -818,9 +826,11 @@ void Paxos::leader_init()
   new_value.clear();
 
   if (mon->get_quorum().size() == 1) {
-    state = STATE_ACTIVE;                          
+    state = STATE_ACTIVE;
+    mon->recovered_machine(machine_id);
     return;
-  } 
+  }
+
   state = STATE_RECOVERING;
   lease_expire = utime_t();
   dout(10) << "leader_init -- starting paxos recovery" << dendl;
@@ -947,6 +957,12 @@ version_t Paxos::read_current(bufferlist &bl)
 
 bool Paxos::is_writeable()
 {
+  // do not allow new paxos writes until all paxos machines have
+  // recovered.  this ensures that the global versions we choose at
+  // proposal time are sanely ordered.
+  if (!mon->is_all_paxos_recovered())
+    return false;
+
   if (mon->get_quorum().size() == 1) return true;
   return
     mon->is_leader() &&