From: Sage Weil Date: Tue, 9 Jul 2013 17:55:05 +0000 (-0700) Subject: mon: preserve last_committed_floor across sync X-Git-Tag: v0.67-rc1~126^2 X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=8b866d2e34bf872e6eed1bdb6e6427c8629fe6eb;p=ceph.git mon: preserve last_committed_floor across sync Add a paranoid check to prevent us from forgetting how far ahead our last_committed was when we sync. This prevents an i'll-timed forced-sync from allowing paxos to warp back in time. This should never happen unless there is a perfect storm of bad admin decisions and/or bugs, but we guard against it anyway. See: #5256 Signed-off-by: Sage Weil --- diff --git a/src/mon/Monitor.cc b/src/mon/Monitor.cc index 709a0e110cc3..7edc574ea190 100644 --- a/src/mon/Monitor.cc +++ b/src/mon/Monitor.cc @@ -150,6 +150,7 @@ Monitor::Monitor(CephContext* cct_, string nm, MonitorDBStore *s, sync_full(false), sync_start_version(0), sync_timeout_event(NULL), + sync_last_committed_floor(0), timecheck_round(0), timecheck_acks(0), @@ -429,6 +430,9 @@ int Monitor::preinit() } } + sync_last_committed_floor = store->get("mon_sync", "last_committed_floor"); + dout(10) << "sync_last_committed_floor " << sync_last_committed_floor << dendl; + init_paxos(); health_monitor->init(); @@ -839,8 +843,13 @@ void Monitor::sync_start(entity_inst_t &other, bool full) sync_obtain_latest_monmap(backup_monmap); assert(backup_monmap.length() > 0); + sync_last_committed_floor = MAX(sync_last_committed_floor, paxos->get_version()); + dout(10) << __func__ << " marking sync in progress, storing sync_last_commited_floor " + << sync_last_committed_floor << dendl; + t.put("mon_sync", "latest_monmap", backup_monmap); t.put("mon_sync", "in_sync", 1); + t.put("mon_sync", "last_committed_floor", sync_last_committed_floor); store->apply_transaction(t); assert(g_conf->mon_sync_requester_kill_at != 1); @@ -900,6 +909,7 @@ void Monitor::sync_finish(version_t last_committed) MonitorDBStore::Transaction t; t.erase("mon_sync", "in_sync"); t.erase("mon_sync", "force_sync"); + t.erase("mon_sync", "last_committed_floor"); store->apply_transaction(t); sync_reset(); @@ -1314,27 +1324,34 @@ void Monitor::handle_probe_reply(MMonProbe *m) entity_inst_t other = m->get_source_inst(); - if (paxos->get_version() < m->paxos_first_version && - m->paxos_first_version > 1) { // no need to sync if we're 0 and they start at 1. + if (m->paxos_last_version < sync_last_committed_floor) { dout(10) << " peer paxos versions [" << m->paxos_first_version - << "," << m->paxos_last_version << "]" - << " vs my version " << paxos->get_version() - << " (too far ahead)" - << dendl; - cancel_probe_timeout(); - sync_start(other, true); - m->put(); - return; - } - if (paxos->get_version() + g_conf->paxos_max_join_drift < m->paxos_last_version) { - dout(10) << " peer paxos version " << m->paxos_last_version - << " vs my version " << paxos->get_version() - << " (too far ahead)" + << "," << m->paxos_last_version << "] < my sync_last_committed_floor " + << sync_last_committed_floor << ", ignoring" << dendl; - cancel_probe_timeout(); - sync_start(other, false); - m->put(); - return; + } else { + if (paxos->get_version() < m->paxos_first_version && + m->paxos_first_version > 1) { // no need to sync if we're 0 and they start at 1. + dout(10) << " peer paxos versions [" << m->paxos_first_version + << "," << m->paxos_last_version << "]" + << " vs my version " << paxos->get_version() + << " (too far ahead)" + << dendl; + cancel_probe_timeout(); + sync_start(other, true); + m->put(); + return; + } + if (paxos->get_version() + g_conf->paxos_max_join_drift < m->paxos_last_version) { + dout(10) << " peer paxos version " << m->paxos_last_version + << " vs my version " << paxos->get_version() + << " (too far ahead)" + << dendl; + cancel_probe_timeout(); + sync_start(other, false); + m->put(); + return; + } } // is there an existing quorum? diff --git a/src/mon/Monitor.h b/src/mon/Monitor.h index b875fbc6ee09..cbc91529c214 100644 --- a/src/mon/Monitor.h +++ b/src/mon/Monitor.h @@ -252,6 +252,31 @@ private: version_t sync_start_version; ///< last_committed at sync start Context *sync_timeout_event; ///< timeout event + /** + * floor for sync source + * + * When we sync we forget about our old last_committed value which + * can be dangerous. For example, if we have a cluster of: + * + * mon.a: lc 100 + * mon.b: lc 80 + * mon.c: lc 100 (us) + * + * If something forces us to sync (say, corruption, or manual + * intervention, or bug), we forget last_committed, and might abort. + * If mon.a happens to be down when we come back, we will see: + * + * mon.b: lc 80 + * mon.c: lc 0 (us) + * + * and sync from mon.b, at which point a+b will both have lc 80 and + * come online with a majority holding out of date commits. + * + * Avoid this by preserving our old last_committed value prior to + * sync and never going backwards. + */ + version_t sync_last_committed_floor; + struct C_SyncTimeout : public Context { Monitor *mon; C_SyncTimeout(Monitor *m) : mon(m) {}