From: Sage Weil Date: Thu, 4 Nov 2010 05:22:54 +0000 (-0700) Subject: mds: wait for last_failure_osd_epoch before starting journal replay X-Git-Tag: v0.23~19^2~1 X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=1c934ebd6ff3a3a7000671821a12e83c609f1e27;p=ceph.git mds: wait for last_failure_osd_epoch before starting journal replay Waiting for this epoch is extremely important: it forces the MDS to get the osdmap that includes the blacklist entry for its predecessor. This in turn means that any OSD we contact while trying to read the journal will be forced to get that osdmap (or newer) before handling our read request, so anything we read cannot be overwritten by a racing request from our predecessor. This prevents two MDSs from writing to the journal at the same time. This change fixes potential (and observed!) journal corruption. Signed-off-by: Sage Weil --- diff --git a/src/mds/MDS.cc b/src/mds/MDS.cc index 41d893a72e4..1ed30911fa3 100644 --- a/src/mds/MDS.cc +++ b/src/mds/MDS.cc @@ -1135,13 +1135,19 @@ void MDS::replay_start() set rs; mdsmap->get_recovery_mds_set(rs); rs.erase(whoami); - dout(1) << "now replay. my recovery peers are " << rs << dendl; + dout(1) << "now replay. my recovery peers are " << rs + << ". need osdmap epoch " << mdsmap->get_last_failure_osd_epoch() + <<", have " << osdmap->get_epoch() + << dendl; mdcache->set_recovery_set(rs); // start? - //if (osdmap->get_epoch() > 0 && - //mdsmap->get_epoch() > 0) - boot_start(); + if (osdmap->get_epoch() >= mdsmap->get_last_failure_osd_epoch()) { + boot_start(); + } else { + objecter->wait_for_new_map(new C_MDS_BootStart(this, 0), + mdsmap->get_last_failure_osd_epoch()); + } } void MDS::replay_done()