mon: preserve last_committed_floor across sync

author Sage Weil <sage@inktank.com>

Tue, 9 Jul 2013 17:55:05 +0000 (10:55 -0700)

committer Sage Weil <sage@inktank.com>

Tue, 9 Jul 2013 18:05:49 +0000 (11:05 -0700)
author Sage Weil <sage@inktank.com>
Tue, 9 Jul 2013 17:55:05 +0000 (10:55 -0700)
committer Sage Weil <sage@inktank.com>
Tue, 9 Jul 2013 18:05:49 +0000 (11:05 -0700)
diff --git a/src/mon/Monitor.cc b/src/mon/Monitor.cc

index 709a0e110cc3c0efd2562b658870341e2a47d769..7edc574ea190b8390feb30575c1ce70c3b7f5cc5 100644 (file)
--- a/src/mon/Monitor.cc
+++ b/src/mon/Monitor.cc
@@ -150,6 +150,7 @@ Monitor::Monitor(CephContext* cct_, string nm, MonitorDBStore *s,
    sync_full(false),
    sync_start_version(0),
    sync_timeout_event(NULL),
+  sync_last_committed_floor(0),
  
    timecheck_round(0),
    timecheck_acks(0),
@@ -429,6 +430,9 @@ int Monitor::preinit()
      }
    }
  
+  sync_last_committed_floor = store->get("mon_sync", "last_committed_floor");
+  dout(10) << "sync_last_committed_floor " << sync_last_committed_floor << dendl;
+
    init_paxos();
    health_monitor->init();
  
@@ -839,8 +843,13 @@ void Monitor::sync_start(entity_inst_t &other, bool full)
      sync_obtain_latest_monmap(backup_monmap);
      assert(backup_monmap.length() > 0);
  
+    sync_last_committed_floor = MAX(sync_last_committed_floor, paxos->get_version());
+    dout(10) << __func__ << " marking sync in progress, storing sync_last_commited_floor "
+            << sync_last_committed_floor << dendl;
+
      t.put("mon_sync", "latest_monmap", backup_monmap);
      t.put("mon_sync", "in_sync", 1);
+    t.put("mon_sync", "last_committed_floor", sync_last_committed_floor);
      store->apply_transaction(t);
  
      assert(g_conf->mon_sync_requester_kill_at != 1);
@@ -900,6 +909,7 @@ void Monitor::sync_finish(version_t last_committed)
    MonitorDBStore::Transaction t;
    t.erase("mon_sync", "in_sync");
    t.erase("mon_sync", "force_sync");
+  t.erase("mon_sync", "last_committed_floor");
    store->apply_transaction(t);
  
    sync_reset();
@@ -1314,27 +1324,34 @@ void Monitor::handle_probe_reply(MMonProbe *m)
  
    entity_inst_t other = m->get_source_inst();
  
-  if (paxos->get_version() < m->paxos_first_version &&
-      m->paxos_first_version > 1) {  // no need to sync if we're 0 and they start at 1.
+  if (m->paxos_last_version < sync_last_committed_floor) {
      dout(10) << " peer paxos versions [" << m->paxos_first_version
-            << "," << m->paxos_last_version << "]"
-            << " vs my version " << paxos->get_version()
-            << " (too far ahead)"
-            << dendl;
-    cancel_probe_timeout();
-    sync_start(other, true);
-    m->put();
-    return;
-  }
-  if (paxos->get_version() + g_conf->paxos_max_join_drift < m->paxos_last_version) {
-    dout(10) << " peer paxos version " << m->paxos_last_version
-            << " vs my version " << paxos->get_version()
-            << " (too far ahead)"
+            << "," << m->paxos_last_version << "] < my sync_last_committed_floor "
+            << sync_last_committed_floor << ", ignoring"
              << dendl;
-    cancel_probe_timeout();
-    sync_start(other, false);
-    m->put();
-    return;
+  } else {
+    if (paxos->get_version() < m->paxos_first_version &&
+       m->paxos_first_version > 1) {  // no need to sync if we're 0 and they start at 1.
+      dout(10) << " peer paxos versions [" << m->paxos_first_version
+              << "," << m->paxos_last_version << "]"
+              << " vs my version " << paxos->get_version()
+              << " (too far ahead)"
+              << dendl;
+      cancel_probe_timeout();
+      sync_start(other, true);
+      m->put();
+      return;
+    }
+    if (paxos->get_version() + g_conf->paxos_max_join_drift < m->paxos_last_version) {
+      dout(10) << " peer paxos version " << m->paxos_last_version
+              << " vs my version " << paxos->get_version()
+              << " (too far ahead)"
+              << dendl;
+      cancel_probe_timeout();
+      sync_start(other, false);
+      m->put();
+      return;
+    }
    }
  
    // is there an existing quorum?
diff --git a/src/mon/Monitor.h b/src/mon/Monitor.h

index b875fbc6ee09f23da307fd117e0ae84f9134e633..cbc91529c214c162ffebd1d090793cecbaed9cc3 100644 (file)
--- a/src/mon/Monitor.h
+++ b/src/mon/Monitor.h
@@ -252,6 +252,31 @@ private:
    version_t sync_start_version;  ///< last_committed at sync start
    Context *sync_timeout_event;   ///< timeout event
  
+  /**
+   * floor for sync source
+   *
+   * When we sync we forget about our old last_committed value which
+   * can be dangerous.  For example, if we have a cluster of:
+   *
+   *   mon.a: lc 100
+   *   mon.b: lc 80
+   *   mon.c: lc 100 (us)
+   *
+   * If something forces us to sync (say, corruption, or manual
+   * intervention, or bug), we forget last_committed, and might abort.
+   * If mon.a happens to be down when we come back, we will see:
+   *
+   *   mon.b: lc 80
+   *   mon.c: lc 0 (us)
+   *
+   * and sync from mon.b, at which point a+b will both have lc 80 and
+   * come online with a majority holding out of date commits.
+   *
+   * Avoid this by preserving our old last_committed value prior to
+   * sync and never going backwards.
+   */
+  version_t sync_last_committed_floor;
+
    struct C_SyncTimeout : public Context {
      Monitor *mon;
      C_SyncTimeout(Monitor *m) : mon(m) {}
author	Sage Weil <sage@inktank.com>
	Tue, 9 Jul 2013 17:55:05 +0000 (10:55 -0700)
committer	Sage Weil <sage@inktank.com>
	Tue, 9 Jul 2013 18:05:49 +0000 (11:05 -0700)
src/mon/Monitor.cc		patch \| blob \| history
src/mon/Monitor.h		patch \| blob \| history