This is extremely important because it forces the MDS to get the osdmap that
includes the blacklist entry for its predecessor. This in turn means that
any OSD we contact trying to read the journal will be forced to get that
osdmap (or newer) before handling our read request, which means that
anything we read cannot be overwritten by a racing request from our
predecessor. This prevents two MDSs writing to the journal at the same
time.
This change fixes potential (and observed!) journal corruption.
Signed-off-by: Sage Weil <sage@newdream.net>
set<int> rs;
mdsmap->get_recovery_mds_set(rs);
rs.erase(whoami);
- dout(1) << "now replay. my recovery peers are " << rs << dendl;
+ dout(1) << "now replay. my recovery peers are " << rs
+ << ". need osdmap epoch " << mdsmap->get_last_failure_osd_epoch()
+ <<", have " << osdmap->get_epoch()
+ << dendl;
mdcache->set_recovery_set(rs);
// start?
- //if (osdmap->get_epoch() > 0 &&
- //mdsmap->get_epoch() > 0)
- boot_start();
+ if (osdmap->get_epoch() >= mdsmap->get_last_failure_osd_epoch()) {
+ boot_start();
+ } else {
+ objecter->wait_for_new_map(new C_MDS_BootStart(this, 0),
+ mdsmap->get_last_failure_osd_epoch());
+ }
}
void MDS::replay_done()