From: Sage Weil Date: Thu, 25 Feb 2010 18:52:20 +0000 (-0800) Subject: osd: do not activate pg if lost osds and no acting has gone active X-Git-Tag: v0.20~402 X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=5b78f5db8c200edcc949033e1badae70fecd2e08;p=ceph.git osd: do not activate pg if lost osds and no acting has gone active If no acting osd has gone active since it most recently joined the pg, then we may not have up-to-date pg state (log, etc). It may even be empty. If so, then do not activate even if an osd is marked lost. --- diff --git a/src/osd/PG.cc b/src/osd/PG.cc index ff3f9c527a1c..9f18cccd0abf 100644 --- a/src/osd/PG.cc +++ b/src/osd/PG.cc @@ -882,6 +882,33 @@ void PG::build_prior() past_intervals.begin()->first > info.history.last_epoch_started)) generate_past_intervals(); + // see if i have ever started since joining the pg. this is important only + // if we want to exclude lost osds. + set started_since_joining; + for (vector::iterator q = acting.begin(); q != acting.end(); q++) { + int o = *q; + + for (map::reverse_iterator p = past_intervals.rbegin(); + p != past_intervals.rend(); + p++) { + Interval &interval = p->second; + if (interval.last < info.history.last_epoch_started) + break; // we don't care + if (!interval.maybe_went_rw) + continue; + bool in = false; + for (vector::iterator q = interval.acting.begin(); q != interval.acting.end(); q++) + if (*q == o) + in = true; + if (in) + started_since_joining.insert(o); + else + break; + } + } + + dout(10) << "build_prior " << started_since_joining << " have started since joining this pg" << dendl; + for (map::reverse_iterator p = past_intervals.rbegin(); p != past_intervals.rend(); p++) { @@ -931,9 +958,19 @@ void PG::build_prior() // did any osds survive _this_ interval? 
any_survived = true; } else if (pinfo.lost_at > interval.first) { - dout(10) << "build_prior prior osd" << o - << " is down, but marked lost at " << pinfo.lost_at << dendl; prior_set_down.insert(o); + if (started_since_joining.size()) { + dout(10) << "build_prior prior osd" << o + << " is down, but marked lost at " << pinfo.lost_at + << ", and " << started_since_joining << " have started since joining pg" + << dendl; + } else { + dout(10) << "build_prior prior osd" << o + << " is down, but marked lost at " << pinfo.lost_at + << ", and NO acting osds have started since joining pg, so i may not have any pg state :/" + << dendl; + need_down++; + } } else { dout(10) << "build_prior prior osd" << o << " is down" << dendl;