From 4a1a33f7c5e66f55b3d0ae063a19f91ddd2f9160 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Thu, 6 Nov 2008 15:03:49 -0800 Subject: [PATCH] osd: improve build_prior logic If, during some interval since the pg last went active, we may have gone rw, but none of the osds survived, then we include all of those osds in the prior_set (even tho they're down), because they may have written data that we want. The prior logic appears to have been broken. It was only looking at the primary osd. --- src/osd/PG.cc | 50 +++++++++++++++++++++++++++++--------------------- 1 file changed, 29 insertions(+), 21 deletions(-) diff --git a/src/osd/PG.cc b/src/osd/PG.cc index f05309df4330f..86bfc6f0b862c 100644 --- a/src/osd/PG.cc +++ b/src/osd/PG.cc @@ -693,45 +693,53 @@ void PG::build_prior() << dendl; int num_still_up_or_clean = 0; + bool any_survived = false; for (unsigned i=0; iosdmap->is_up(acting[i])) { // is up now - if (first_epoch <= info.history.last_epoch_started && // FIXME this is sloppy i think? + + // FIXME this is sloppy i think? + if (first_epoch <= info.history.last_epoch_started && last_epoch >= info.history.last_epoch_started) any_up = true; + any_survived = true; + if (acting[i] != osd->whoami) // and is not me prior_set.insert(acting[i]); - + // has it been up this whole time? if (osd->osdmap->get_up_from(acting[i]) <= first_epoch) num_still_up_or_clean++; } else { - dout(10) << "build_prior prior osd" << acting[i] << " is down, must notify mon" << dendl; + dout(10) << "build_prior prior osd" << acting[i] + << " is down, must notify mon" << dendl; must_notify_mon = true; - - // include osd in set anyway prior_set_down.insert(acting[i]); + } + } - // fixme: how do we identify a "clean" shutdown anyway? - - if (i == 0) { - if (maybe_went_rw) { - dout(10) << "build_prior pg possibly went active+rw in epoch " - << (lastmap->get_up_thru(acting[i]) + 1) << dendl; - some_down = true; - prior_set.insert(acting[i]); - - // take note that we care about this osd's up_thru. if it - // changes later, it will affect our prior_set, and we'll want - // to rebuild the prior set! - prior_set_up_thru[acting[0]] = lastmap->get_up_thru(acting[0]); - } + // if nobody survived this interval, and we may have gone rw, + // then we need to wait for one of those osds to recover to + // ensure that we haven't lost any information. + if (!any_survived && maybe_went_rw) { + // fixme: how do we identify a "clean" shutdown anyway? + dout(10) << "build_prior possibly went active+rw, no survivors, including" << dendl; + for (unsigned i=0; iosdmap->is_down(acting[i])) { + prior_set.insert(acting[i]); + prior_set_down.erase(acting[i]); } - } + some_down = true; + + // take note that we care about the primary's up_thru. if it + // changes later, it will affect our prior_set, and we'll want + // to rebuild it! + prior_set_up_thru[acting[0]] = lastmap->get_up_thru(acting[0]); } if (num_still_up_or_clean == 0) { - dout(10) << "build_prior none of " << acting << " still up or cleanly shutdown, pg crashed" << dendl; + dout(10) << "build_prior none of " << acting + << " still up or cleanly shutdown, pg crashed" << dendl; state_set(PG_STATE_CRASHED); } } -- 2.39.5