From 4823b0719fe898a83e8c55f82b2f884b2df9040c Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Mon, 24 Nov 2008 11:30:17 -0800 Subject: [PATCH] osd: simplify osdmap tracking of osd up/down epochs; fix pg build_prior logic Use a single struct to track all of our osd up/down info. Include down_at, the epoch we last marked the osd down. Fix PG::build_prior to require that the osd was clean through the _entire_ interval in question. In monitor, adjust new clean interval forward to down_at-1 if the up_from matches the interval we mounted. That is, if the OSD shut down cleanly, it obviously remained clean at least until we marked it down in the map. --- src/mon/OSDMonitor.cc | 25 +++++++-- src/osd/OSDMap.h | 125 ++++++++++++++++++++++++++---------------- src/osd/PG.cc | 17 +++--- 3 files changed, 108 insertions(+), 59 deletions(-) diff --git a/src/mon/OSDMonitor.cc b/src/mon/OSDMonitor.cc index 0c8b8dce45a99..1bfd256305dbe 100644 --- a/src/mon/OSDMonitor.cc +++ b/src/mon/OSDMonitor.cc @@ -499,7 +499,7 @@ bool OSDMonitor::preprocess_boot(MOSDBoot *m) bool OSDMonitor::prepare_boot(MOSDBoot *m) { - dout(7) << "prepare_boot from " << m->get_orig_source_inst() << dendl; + dout(7) << "prepare_boot from " << m->get_orig_source_inst() << " sb " << m->sb << dendl; assert(m->get_orig_source().is_osd()); int from = m->get_orig_source().num(); @@ -530,9 +530,26 @@ bool OSDMonitor::prepare_boot(MOSDBoot *m) if (m->sb.weight) osd_weight[from] = m->sb.weight; - // note last clean unmount epoch - pending_inc.new_last_clean_interval[from] = - pair(m->sb.epoch_mounted, m->sb.epoch_unmounted); + // adjust last clean unmount epoch? 
+ const osd_info_t& info = osdmap.get_info(from); + dout(10) << " old osd_info: " << info << dendl; + if (m->sb.epoch_mounted > info.last_clean_first || + (m->sb.epoch_mounted == info.last_clean_first && + m->sb.epoch_unmounted > info.last_clean_last)) { + epoch_t first = m->sb.epoch_mounted; + epoch_t last = m->sb.epoch_unmounted; + + // adjust clean interval forward to the epoch the osd was actually marked down. + if (info.up_from == first && + (info.down_at-1) > last) + last = info.down_at-1; + + dout(10) << "prepare_boot osd" << from << " last_clean_interval " + << info.last_clean_first << "-" << info.last_clean_last + << " -> " << first << "-" << last + << dendl; + pending_inc.new_last_clean_interval[from] = pair(first, last); + } // wait paxos->wait_for_commit(new C_Booted(this, m)); diff --git a/src/osd/OSDMap.h b/src/osd/OSDMap.h index bf5d9076d1acb..b196b43a2009a 100644 --- a/src/osd/OSDMap.h +++ b/src/osd/OSDMap.h @@ -62,6 +62,53 @@ inline int calc_bits_of(int t) { +/* + * we track up to two intervals during which the osd was alive and + * healthy. the most recent is [up_from,up_thru), where up_thru is + * the last epoch the osd is known to have _started_. i.e., a lower + * bound on the actual osd death. down_at (if it is > up_from) is an + * upper bound on the actual osd death. + * + * the second is the last_clean interval [first,last]. in that case, + * the last interval is the last epoch known to have been either + * _finished_, or during which the osd cleanly shut down. when + * possible, we push this forward to the epoch the osd was eventually + * marked down. 
+ */ +struct osd_info_t { + epoch_t last_clean_first; // last interval that ended with a clean osd shutdown + epoch_t last_clean_last; + epoch_t up_from; // epoch osd marked up + epoch_t up_thru; // lower bound on actual osd death + epoch_t down_at; // upper bound on actual osd death (if > up_from) + + osd_info_t() : last_clean_first(0), last_clean_last(0), + up_from(0), up_thru(0), down_at(0) {} + void encode(bufferlist& bl) const { + ::encode(last_clean_first, bl); + ::encode(last_clean_last, bl); + ::encode(up_from, bl); + ::encode(up_thru, bl); + ::encode(down_at, bl); + } + void decode(bufferlist::iterator& bl) { + ::decode(last_clean_first, bl); + ::decode(last_clean_last, bl); + ::decode(up_from, bl); + ::decode(up_thru, bl); + ::decode(down_at, bl); + } +}; +WRITE_CLASS_ENCODER(osd_info_t) + +inline ostream& operator<<(ostream& out, const osd_info_t& info) { + return out << "up_from " << info.up_from + << " up_thru " << info.up_thru + << " down_at " << info.down_at + << " last_clean_interval " << info.last_clean_first << "-" << info.last_clean_last; +} + + /** OSDMap */ class OSDMap { @@ -201,20 +248,7 @@ private: vector osd_state; vector osd_addr; vector<__u32> osd_weight; // 16.16 fixed point, 0x10000 = "in", 0 = "out" - - /* - * we track up to two intervals during which the osd was alive and - * healthy. the most recent is [up_from,up_thru), where up_thru is the - * last epoch the osd is known to have _started_. i.e., a lower bound on the - * actual osd death. - * - * the second is last_clean_interval [first,last]. in that case, the last - * interval is the last epoch known to have been either _finished_, or during - * which the osd cleanly shut down. - */ - vector osd_up_from; // when it went up - vector osd_up_thru; // lower bound on _actual_ osd death. 
- vector > osd_last_clean_interval; + vector osd_info; map pg_swap_primary; // force new osd to be pg primary (if already a member) snapid_t max_snap; @@ -285,16 +319,10 @@ private: int o = max_osd; max_osd = m; osd_state.resize(m); - osd_up_from.resize(m); - osd_up_thru.resize(m); - osd_last_clean_interval.resize(m); + osd_info.resize(m); osd_weight.resize(m); for (; o get_last_clean_interval(int osd) { + osd_info_t& get_info(int osd) { assert(exists(osd)); - return osd_last_clean_interval[osd]; + return osd_info[osd]; } int get_any_up_osd() { @@ -452,37 +484,42 @@ private: if (inc.new_max_osd >= 0) set_max_osd(inc.new_max_osd); + for (map::iterator i = inc.new_weight.begin(); + i != inc.new_weight.end(); + i++) + set_weight(i->first, i->second); + + // up/down for (map::iterator i = inc.new_down.begin(); i != inc.new_down.end(); i++) { assert(osd_state[i->first] & CEPH_OSD_UP); osd_state[i->first] &= ~CEPH_OSD_UP; + osd_info[i->first].down_at = epoch; //cout << "epoch " << epoch << " down osd" << i->first << endl; } - for (map::iterator i = inc.new_weight.begin(); - i != inc.new_weight.end(); - i++) - set_weight(i->first, i->second); + for (map::iterator i = inc.new_up.begin(); + i != inc.new_up.end(); + i++) { + osd_state[i->first] |= CEPH_OSD_UP; + osd_addr[i->first] = i->second; + osd_info[i->first].up_from = epoch; + //cout << "epoch " << epoch << " up osd" << i->first << " at " << i->second << endl; + } + // info for (map::iterator i = inc.new_up_thru.begin(); i != inc.new_up_thru.end(); i++) - osd_up_thru[i->first] = i->second; - + osd_info[i->first].up_thru = i->second; for (map >::iterator i = inc.new_last_clean_interval.begin(); i != inc.new_last_clean_interval.end(); - i++) - osd_last_clean_interval[i->first] = i->second; - - for (map::iterator i = inc.new_up.begin(); - i != inc.new_up.end(); i++) { - osd_state[i->first] |= CEPH_OSD_UP; - osd_addr[i->first] = i->second; - osd_up_from[i->first] = epoch; - //cout << "epoch " << epoch << " up osd" << 
i->first << " at " << i->second << endl; + osd_info[i->first].last_clean_first = i->second.first; + osd_info[i->first].last_clean_last = i->second.second; } + // pg swap for (map::iterator i = inc.new_pg_swap_primary.begin(); i != inc.new_pg_swap_primary.end(); i++) @@ -528,9 +565,7 @@ private: ::encode(cbl, blist); // extended - ::encode(osd_up_from, blist); - ::encode(osd_up_thru, blist); - ::encode(osd_last_clean_interval, blist); + ::encode(osd_info, blist); ::encode(pg_swap_primary, blist); ::encode(max_snap, blist); @@ -564,9 +599,7 @@ private: crush.decode(cblp); // extended - ::decode(osd_up_from, p); - ::decode(osd_up_thru, p); - ::decode(osd_last_clean_interval, p); + ::decode(osd_info, p); ::decode(pg_swap_primary, p); ::decode(max_snap, p); diff --git a/src/osd/PG.cc b/src/osd/PG.cc index 9a499df985030..caee8bf2d4644 100644 --- a/src/osd/PG.cc +++ b/src/osd/PG.cc @@ -788,17 +788,16 @@ void PG::build_prior() int crashed = 0; bool any_survived = false; for (unsigned i=0; iosdmap->get_info(interval.acting[i]); - // if the osd is not still alive (i.e. failed after this interval) and - // did not stop cleanly, then pg crashed. - // note that it is possible it shut down cleanly after the interval, but we - // do not keep full clean_thru info handy for all shutdowns, so we can't - // be sure it didn't crash, start, then stop cleanly. - pair lci = osd->osdmap->get_last_clean_interval(interval.acting[i]); - if (osd->osdmap->get_up_from(interval.acting[i]) > interval.last && - !(lci.first <= interval.first && lci.second >= interval.first)) { + // if the osd restarted after this interval but is not known to have + // cleanly survived through this interval, we mark the pg crashed. 
+ if (pinfo.up_from > interval.last && + !(pinfo.last_clean_first <= interval.first && + pinfo.last_clean_last >= interval.last)) { dout(10) << "build_prior prior osd" << interval.acting[i] - << " went down and last clean interval " << lci.first << "-" << lci.second + << " up_from " << pinfo.up_from + << " and last clean interval " << pinfo.last_clean_first << "-" << pinfo.last_clean_last << " does not include us" << dendl; crashed++; } -- 2.39.5