From: Samuel Just Date: Thu, 21 Apr 2011 20:36:13 +0000 (-0700) Subject: PG: Refactor build_prior into a PgPriorSet constructor. X-Git-Tag: v0.28~74^2~50 X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=1477c0682679962e0fa6668a58728913e0b0d342;p=ceph.git PG: Refactor build_prior into a PgPriorSet constructor. --- diff --git a/src/osd/PG.cc b/src/osd/PG.cc index 66cfd65f4c5..d0a2914a79f 100644 --- a/src/osd/PG.cc +++ b/src/osd/PG.cc @@ -1089,219 +1089,51 @@ void PG::build_prior() // sanity check for (map::iterator it = peer_info.begin(); it != peer_info.end(); - it++) + it++) { assert(info.history.last_epoch_started >= it->second.history.last_epoch_started); + } } + generate_past_intervals(); - /* - * We have to be careful to gracefully deal with situations like - * so. Say we have a power outage or something that takes out both - * OSDs, but the monitor doesn't mark them down in the same epoch. - * The history may look like - * - * 1: A B - * 2: B - * 3: let's say B dies for good, too (say, from the power spike) - * 4: A - * - * which makes it look like B may have applied updates to the PG - * that we need in order to proceed. This sucks... - * - * To minimize the risk of this happening, we CANNOT go active if - * _any_ OSDs in the prior set are down until we send an MOSDAlive - * to the monitor such that the OSDMap sets osd_up_thru to an epoch. - * Then, we have something like - * - * 1: A B - * 2: B alive_thru[B]=0 - * 3: - * 4: A - * - * -> we can ignore B, bc it couldn't have gone active (alive_thru - * still 0). - * - * or, - * - * 1: A B - * 2: B alive_thru[B]=0 - * 3: B alive_thru[B]=2 - * 4: - * 5: A - * - * -> we must wait for B, bc it was alive through 2, and could have - written to the pg. - * - * If B is really dead, then an administrator will need to manually - * intervene by marking the OSD as "lost." - */ - - prior_set.reset(new PgPriorSet()); - PgPriorSet& prior(*prior_set.get()); - - // current up and/or acting nodes, of course. - for (unsigned i=0; iwhoami) - prior.cur.insert(up[i]); - for (unsigned i=0; iwhoami) - prior.cur.insert(acting[i]); - - // and prior PG mappings. move backwards in time. state_clear(PG_STATE_CRASHED); state_clear(PG_STATE_DOWN); - bool some_down = false; - // generate past intervals, if we don't have them. - generate_past_intervals(); - - // see if i have ever started since joining the pg. this is important only - // if we want to exclude lost osds. - set started_since_joining; - for (vector::iterator q = acting.begin(); q != acting.end(); q++) { - int o = *q; - - for (map::reverse_iterator p = past_intervals.rbegin(); - p != past_intervals.rend(); - p++) { - Interval &interval = p->second; - if (interval.last < info.history.last_epoch_started) - break; // we don't care - if (!interval.maybe_went_rw) - continue; - if (std::find(interval.acting.begin(), interval.acting.end(), o) - != interval.acting.end()) - started_since_joining.insert(o); - break; - } + stringstream out; + prior_set.reset(new PgPriorSet(osd->whoami, + *osd->osdmap, + past_intervals, + up, + acting, + info, + this)); + dout(10) << out << dendl; + PgPriorSet &prior(*prior_set.get()); + + + dout(10) << "build_prior: " << *this << " " + << (prior.crashed ? " crashed":"") + << (prior.pg_down ? " down":"") + << (prior.some_down ? " some_down":"") + << dendl; + // take note that we care about the primary's up_thru. if it + // changes later, it will affect our prior_set, and we'll want + // to rebuild it! + if (prior.crashed) { + state_set(PG_STATE_CRASHED); } - - dout(10) << "build_prior " << started_since_joining << " have started since joining this pg" << dendl; - - for (map::reverse_iterator p = past_intervals.rbegin(); - p != past_intervals.rend(); - p++) { - Interval &interval = p->second; - dout(10) << "build_prior " << interval << dendl; - - if (interval.last < info.history.last_epoch_started) - break; // we don't care - - if (interval.acting.empty()) - continue; - - int crashed = 0; - int need_down = 0; - bool any_survived = false; - - // consider UP osds - for (unsigned i=0; iosdmap->is_up(o)) { // is up now - if (o != osd->whoami) // and is not me - prior.cur.insert(o); - } - } - - // consider ACTING osds - for (unsigned i=0; iosdmap->exists(o)) - pinfo = &osd->osdmap->get_info(o); - - // if the osd restarted after this interval but is not known to have - // cleanly survived through this interval, we mark the pg crashed. - if (pinfo && (pinfo->up_from > interval.last && - !(pinfo->last_clean_first <= interval.first && - pinfo->last_clean_last >= interval.last))) { - dout(10) << "build_prior prior osd" << o - << " up_from " << pinfo->up_from - << " and last clean interval " << pinfo->last_clean_first << "-" << pinfo->last_clean_last - << " does not include us" << dendl; - crashed++; - } - - if (osd->osdmap->is_up(o)) { // is up now - // did any osds survive _this_ interval? - any_survived = true; - } else if (!pinfo || pinfo->lost_at > interval.first) { - prior.down.insert(0); - if (started_since_joining.size()) { - if (pinfo) - dout(10) << "build_prior prior osd" << o - << " is down, but marked lost at " << pinfo->lost_at - << ", and " << started_since_joining << " have started since joining pg" - << dendl; - else - dout(10) << "build_prior prior osd" << o - << " no longer exists, and " << started_since_joining << " have started since joining pg" - << dendl; - - } else { - if (pinfo) - dout(10) << "build_prior prior osd" << o - << " is down, but marked lost at " << pinfo->lost_at - << ", and NO acting osds have started since joining pg, so i may not have any pg state :/" - << dendl; - else - dout(10) << "build_prior prior osd" << o - << " no longer exists, and NO acting osds have started since joining pg, so i may not have any pg state :/" - << dendl; - need_down++; - } - } else { - dout(10) << "build_prior prior osd" << o - << " is down" << dendl; - need_down++; - prior.down.insert(o); - } - } - - // if nobody survived this interval, and we may have gone rw, - // then we need to wait for one of those osds to recover to - // ensure that we haven't lost any information. - if (!any_survived && need_down && interval.maybe_went_rw) { - // fixme: how do we identify a "clean" shutdown anyway? - dout(10) << "build_prior " << need_down - << " osds possibly went active+rw, no survivors, including" << dendl; - for (unsigned i=0; iosdmap->is_down(interval.acting[i])) { - prior.cur.insert(interval.acting[i]); - state_set(PG_STATE_DOWN); - } - some_down = true; - - // take note that we care about the primary's up_thru. if it - // changes later, it will affect our prior_set, and we'll want - // to rebuild it! - OSDMap *lastmap = osd->get_map(interval.last); - prior.up_thru[interval.acting[0]] = lastmap->get_up_thru(interval.acting[0]); - } - - if (crashed) { - dout(10) << "build_prior one of " << interval.acting - << " possibly crashed, marking pg crashed" << dendl; - state_set(PG_STATE_CRASHED); - } + if (prior.pg_down) { + state_set(PG_STATE_DOWN); } - // Build prior_set.lost - for (set::const_iterator i = prior.cur.begin(); - i != prior.cur.end(); ++i) { - int o = *i; - const osd_info_t& pinfo(osd->osdmap->get_info(o)); - if (pinfo.lost_at > pinfo.up_from) { - prior.lost.insert(o); + if (prior.some_down) { + need_up_thru = true; + for (vector::iterator i = prior.inter_up_thru.begin(); + i != prior.inter_up_thru.end(); + ++i) { + OSDMap *lastmap = osd->get_map(i->last); + prior.up_thru[i->acting[0]] = lastmap->get_up_thru(i->acting[0]); } } - - dout(10) << "build_prior: " << prior << " " - << (is_crashed() ? " crashed":"") - << (is_down() ? " down":"") - << (some_down ? " some_down":"") - << dendl; } void PG::clear_primary_state() @@ -3876,3 +3708,201 @@ std::ostream& operator<<(std::ostream& oss, << "lost=" << prior.lost << " ]]"; return oss; } +#undef dout_prefix +#define dout_prefix (*_dout << pg->gen_prefix() << "PgPriorSet: ") + +PG::PgPriorSet::PgPriorSet(int whoami, + const OSDMap &osdmap, + const map &past_intervals, + const vector &up, + const vector &acting, + const PG::Info &info, + const PG* pg) + : crashed(false), pg_down(false), some_down(false), pg(pg) +{ + /* + * We have to be careful to gracefully deal with situations like + * so. Say we have a power outage or something that takes out both + * OSDs, but the monitor doesn't mark them down in the same epoch. + * The history may look like + * + * 1: A B + * 2: B + * 3: let's say B dies for good, too (say, from the power spike) + * 4: A + * + * which makes it look like B may have applied updates to the PG + * that we need in order to proceed. This sucks... + * + * To minimize the risk of this happening, we CANNOT go active if + * _any_ OSDs in the prior set are down until we send an MOSDAlive + * to the monitor such that the OSDMap sets osd_up_thru to an epoch. + * Then, we have something like + * + * 1: A B + * 2: B alive_thru[B]=0 + * 3: + * 4: A + * + * -> we can ignore B, bc it couldn't have gone active (alive_thru + * still 0). + * + * or, + * + * 1: A B + * 2: B alive_thru[B]=0 + * 3: B alive_thru[B]=2 + * 4: + * 5: A + * + * -> we must wait for B, bc it was alive through 2, and could have + written to the pg. + * + * If B is really dead, then an administrator will need to manually + * intervene by marking the OSD as "lost." + */ + // current up and/or acting nodes, of course. + for (unsigned i=0; i started_since_joining; + for (vector::const_iterator q = acting.begin(); q != acting.end(); q++) { + int o = *q; + + for (map::const_reverse_iterator p = past_intervals.rbegin(); + p != past_intervals.rend(); + p++) { + const Interval &interval = p->second; + if (interval.last < info.history.last_epoch_started) + break; // we don't care + if (!interval.maybe_went_rw) + continue; + if (std::find(interval.acting.begin(), interval.acting.end(), o) + != interval.acting.end()) + started_since_joining.insert(o); + break; + } + } + + dout(10) << "build_prior " << started_since_joining << " have started since joining this pg" << dendl; + + for (map::const_reverse_iterator p = past_intervals.rbegin(); + p != past_intervals.rend(); + p++) { + const Interval &interval = p->second; + dout(10) << "build_prior " << interval << dendl; + + if (interval.last < info.history.last_epoch_started) + break; // we don't care + + if (interval.acting.empty()) + continue; + + int need_down = 0; + bool any_survived = false; + + // consider UP osds + for (unsigned i=0; iup_from > interval.last && + !(pinfo->last_clean_first <= interval.first && + pinfo->last_clean_last >= interval.last))) { + dout(10) << "build_prior prior osd" << o + << " up_from " << pinfo->up_from + << " and last clean interval " << pinfo->last_clean_first << "-" << pinfo->last_clean_last + << " does not include us" << dendl; + crashed = true; + } + + if (osdmap.is_up(o)) { // is up now + // did any osds survive _this_ interval? + any_survived = true; + } else if (!pinfo || pinfo->lost_at > interval.first) { + down.insert(0); + if (started_since_joining.size()) { + if (pinfo) + dout(10) << "build_prior prior osd" << o + << " is down, but marked lost at " << pinfo->lost_at + << ", and " << started_since_joining << " have started since joining pg" + << dendl; + else + dout(10) << "build_prior prior osd" << o + << " no longer exists, and " << started_since_joining << " have started since joining pg" + << dendl; + + } else { + if (pinfo) + dout(10) << "build_prior prior osd" << o + << " is down, but marked lost at " << pinfo->lost_at + << ", and NO acting osds have started since joining pg, so i may not have any pg state :/" + << dendl; + else + dout(10) << "build_prior prior osd" << o + << " no longer exists, and NO acting osds have started since joining pg, so i may not have any pg state :/" + << dendl; + need_down++; + } + } else { + dout(10) << "build_prior prior osd" << o + << " is down" << dendl; + need_down++; + down.insert(o); + } + } + + // if nobody survived this interval, and we may have gone rw, + // then we need to wait for one of those osds to recover to + // ensure that we haven't lost any information. + if (!any_survived && need_down && interval.maybe_went_rw) { + // fixme: how do we identify a "clean" shutdown anyway? + dout(10) << "build_prior " << need_down + << " osds possibly went active+rw, no survivors, including" << dendl; + for (unsigned i=0; i::const_iterator i = cur.begin(); + i != cur.end(); ++i) { + int o = *i; + const osd_info_t& pinfo(osdmap.get_info(o)); + if (pinfo.lost_at > pinfo.up_from) { + lost.insert(o); + } + } + +} diff --git a/src/osd/PG.h b/src/osd/PG.h index bcf2c6fe19b..fb986995740 100644 --- a/src/osd/PG.h +++ b/src/osd/PG.h @@ -745,6 +745,18 @@ public: set down; // down osds normally exluded from cur set lost; // osds in the prior set which are lost map up_thru; // osds whose up_thru we care about + vector inter_up_thru; // intervals whose up_thru we care about + bool crashed; + bool pg_down; + bool some_down; + const PG *pg; + PgPriorSet(int whoami, + const OSDMap &osdmap, + const map &past_intervals, + const vector &up, + const vector &acting, + const Info &info, + const PG *pg); }; friend std::ostream& operator<<(std::ostream& oss,