From 4823b0719fe898a83e8c55f82b2f884b2df9040c Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Mon, 24 Nov 2008 11:30:17 -0800
Subject: [PATCH] osd: simplify osdmap tracking of osd up/down epochs; fix pg
 build_prior logic

Use a single struct to track all of our osd up/down info.  Include
down_at, the epoch we last marked the osd down.

Fix PG::build_prior to require that the osd was clean through the _entire_
interval in question.

In monitor, adjust new clean interval forward to down_at-1 if the up_from
matches the interval we mounted.  That is, if the OSD shut down cleanly,
it obviously remained clean at least until we marked it down in the map.
---
 src/mon/OSDMonitor.cc |  25 +++++++--
 src/osd/OSDMap.h      | 125 ++++++++++++++++++++++++++----------------
 src/osd/PG.cc         |  17 +++---
 3 files changed, 108 insertions(+), 59 deletions(-)

diff --git a/src/mon/OSDMonitor.cc b/src/mon/OSDMonitor.cc
index 0c8b8dce45a99..1bfd256305dbe 100644
--- a/src/mon/OSDMonitor.cc
+++ b/src/mon/OSDMonitor.cc
@@ -499,7 +499,7 @@ bool OSDMonitor::preprocess_boot(MOSDBoot *m)
 
 bool OSDMonitor::prepare_boot(MOSDBoot *m)
 {
-  dout(7) << "prepare_boot from " << m->get_orig_source_inst() << dendl;
+  dout(7) << "prepare_boot from " << m->get_orig_source_inst() << " sb " << m->sb << dendl;
   assert(m->get_orig_source().is_osd());
   int from = m->get_orig_source().num();
   
@@ -530,9 +530,26 @@ bool OSDMonitor::prepare_boot(MOSDBoot *m)
     if (m->sb.weight)
       osd_weight[from] = m->sb.weight;
 
-    // note last clean unmount epoch
-    pending_inc.new_last_clean_interval[from] =
-      pair<epoch_t,epoch_t>(m->sb.epoch_mounted, m->sb.epoch_unmounted);
+    // adjust last clean unmount epoch?
+    const osd_info_t& info = osdmap.get_info(from);
+    dout(10) << " old osd_info: " << info << dendl;
+    if (m->sb.epoch_mounted > info.last_clean_first ||
+	(m->sb.epoch_mounted == info.last_clean_first &&
+	 m->sb.epoch_unmounted > info.last_clean_last)) {
+      epoch_t first = m->sb.epoch_mounted;
+      epoch_t last = m->sb.epoch_unmounted;
+
+      // adjust clean interval forward to the epoch the osd was actually marked down.
+      if (info.up_from == first &&
+	  (info.down_at-1) > last)
+	last = info.down_at-1;
+
+      dout(10) << "prepare_boot osd" << from << " last_clean_interval "
+	       << info.last_clean_first << "-" << info.last_clean_last
+	       << " -> " << first << "-" << last
+	       << dendl;
+      pending_inc.new_last_clean_interval[from] = pair<epoch_t,epoch_t>(first, last);
+    }
 
     // wait
     paxos->wait_for_commit(new C_Booted(this, m));
diff --git a/src/osd/OSDMap.h b/src/osd/OSDMap.h
index bf5d9076d1acb..b196b43a2009a 100644
--- a/src/osd/OSDMap.h
+++ b/src/osd/OSDMap.h
@@ -62,6 +62,53 @@ inline int calc_bits_of(int t) {
 
 
 
+/*
+ * we track up to two intervals during which the osd was alive and
+ * healthy.  the most recent is [up_from,up_thru), where up_thru is
+ * the last epoch the osd is known to have _started_.  i.e., a lower
+ * bound on the actual osd death.  down_at (if it is > up_from) is an
+ * upper bound on the actual osd death.
+ *
+ * the second is the last_clean interval [first,last].  in that case,
+ * 'last' is the last epoch known to have been either _finished_, or
+ * during which the osd cleanly shut down.  when possible, we push
+ * 'last' forward to the epoch just before the one in which the osd
+ * was eventually marked down (i.e., down_at-1).
+ */
+struct osd_info_t {
+  epoch_t last_clean_first;  // last interval that ended with a clean osd shutdown
+  epoch_t last_clean_last;
+  epoch_t up_from;   // epoch osd marked up
+  epoch_t up_thru;   // lower bound on actual osd death
+  epoch_t down_at;   // upper bound on actual osd death (if > up_from)
+  
+  osd_info_t() : last_clean_first(0), last_clean_last(0),
+		 up_from(0), up_thru(0), down_at(0) {}
+  void encode(bufferlist& bl) const {
+    ::encode(last_clean_first, bl);
+    ::encode(last_clean_last, bl);
+    ::encode(up_from, bl);
+    ::encode(up_thru, bl);
+    ::encode(down_at, bl);
+  }
+  void decode(bufferlist::iterator& bl) {
+    ::decode(last_clean_first, bl);
+    ::decode(last_clean_last, bl);
+    ::decode(up_from, bl);
+    ::decode(up_thru, bl);
+    ::decode(down_at, bl);
+  }
+};
+WRITE_CLASS_ENCODER(osd_info_t)
+
+inline ostream& operator<<(ostream& out, const osd_info_t& info) {
+  return out << "up_from " << info.up_from
+	     << " up_thru " << info.up_thru
+	     << " down_at " << info.down_at
+	     << " last_clean_interval " << info.last_clean_first << "-" << info.last_clean_last;
+}
+
+
 /** OSDMap
  */
 class OSDMap {
@@ -201,20 +248,7 @@ private:
   vector<uint8_t> osd_state;
   vector<entity_addr_t> osd_addr;
   vector<__u32>   osd_weight;   // 16.16 fixed point, 0x10000 = "in", 0 = "out"
-
-  /*
-   * we track up to two intervals during which the osd was alive and
-   * healthy.  the most recent is [up_from,up_thru), where up_thru is the 
-   * last epoch the osd is known to have _started_.  i.e., a lower bound on the
-   * actual osd death.
-   *
-   * the second is last_clean_interval [first,last].  in that case, the last
-   * interval is the last epoch known to have been either _finished_, or during
-   * which the osd cleanly shut down.
-   */
-  vector<epoch_t> osd_up_from;  // when it went up
-  vector<epoch_t> osd_up_thru;  // lower bound on _actual_ osd death.
-  vector<pair<epoch_t,epoch_t> > osd_last_clean_interval;
+  vector<osd_info_t> osd_info;
 
   map<pg_t,uint32_t> pg_swap_primary;  // force new osd to be pg primary (if already a member)
   snapid_t max_snap;
@@ -285,16 +319,10 @@ private:
     int o = max_osd;
     max_osd = m;
     osd_state.resize(m);
-    osd_up_from.resize(m);
-    osd_up_thru.resize(m);
-    osd_last_clean_interval.resize(m);
+    osd_info.resize(m);
     osd_weight.resize(m);
     for (; o<max_osd; o++) {
       osd_state[o] = 0;
-      osd_up_from[o] = 0;
-      osd_up_thru[o] = 0;
-      osd_last_clean_interval[o].first = 0;
-      osd_last_clean_interval[o].second = 0;
       osd_weight[o] = CEPH_OSD_OUT;
     }
     osd_addr.resize(m);
@@ -390,15 +418,19 @@ private:
 
   epoch_t get_up_from(int osd) {
     assert(exists(osd));
-    return osd_up_from[osd];
+    return osd_info[osd].up_from;
   }
   epoch_t get_up_thru(int osd) {
     assert(exists(osd));
-    return osd_up_thru[osd];
+    return osd_info[osd].up_thru;
+  }
+  epoch_t get_down_at(int osd) {
+    assert(exists(osd));
+    return osd_info[osd].down_at;
   }
-  pair<epoch_t,epoch_t> get_last_clean_interval(int osd) {
+  osd_info_t& get_info(int osd) {
     assert(exists(osd));
-    return osd_last_clean_interval[osd];
+    return osd_info[osd];
   }
   
   int get_any_up_osd() {
@@ -452,37 +484,42 @@ private:
     if (inc.new_max_osd >= 0) 
       set_max_osd(inc.new_max_osd);
 
+    for (map<int32_t,uint32_t>::iterator i = inc.new_weight.begin();
+         i != inc.new_weight.end();
+         i++)
+      set_weight(i->first, i->second);
+
+    // up/down
     for (map<int32_t,uint8_t>::iterator i = inc.new_down.begin();
          i != inc.new_down.end();
          i++) {
       assert(osd_state[i->first] & CEPH_OSD_UP);
       osd_state[i->first] &= ~CEPH_OSD_UP;
+      osd_info[i->first].down_at = epoch;
       //cout << "epoch " << epoch << " down osd" << i->first << endl;
     }
-    for (map<int32_t,uint32_t>::iterator i = inc.new_weight.begin();
-         i != inc.new_weight.end();
-         i++)
-      set_weight(i->first, i->second);
+    for (map<int32_t,entity_addr_t>::iterator i = inc.new_up.begin();
+         i != inc.new_up.end(); 
+         i++) {
+      osd_state[i->first] |= CEPH_OSD_UP;
+      osd_addr[i->first] = i->second;
+      osd_info[i->first].up_from = epoch;
+      //cout << "epoch " << epoch << " up osd" << i->first << " at " << i->second << endl;
+    }
 
+    // info
     for (map<int32_t,epoch_t>::iterator i = inc.new_up_thru.begin();
          i != inc.new_up_thru.end();
          i++)
-      osd_up_thru[i->first] = i->second;
-
+      osd_info[i->first].up_thru = i->second;
     for (map<int32_t,pair<epoch_t,epoch_t> >::iterator i = inc.new_last_clean_interval.begin();
          i != inc.new_last_clean_interval.end();
-         i++)
-      osd_last_clean_interval[i->first] = i->second;
-
-    for (map<int32_t,entity_addr_t>::iterator i = inc.new_up.begin();
-         i != inc.new_up.end(); 
          i++) {
-      osd_state[i->first] |= CEPH_OSD_UP;
-      osd_addr[i->first] = i->second;
-      osd_up_from[i->first] = epoch;
-      //cout << "epoch " << epoch << " up osd" << i->first << " at " << i->second << endl;
+      osd_info[i->first].last_clean_first = i->second.first;
+      osd_info[i->first].last_clean_last = i->second.second;
     }
 
+    // pg swap
     for (map<pg_t,uint32_t>::iterator i = inc.new_pg_swap_primary.begin();
 	 i != inc.new_pg_swap_primary.end();
 	 i++)
@@ -528,9 +565,7 @@ private:
     ::encode(cbl, blist);
 
     // extended
-    ::encode(osd_up_from, blist);
-    ::encode(osd_up_thru, blist);
-    ::encode(osd_last_clean_interval, blist);
+    ::encode(osd_info, blist);
     ::encode(pg_swap_primary, blist);
 
     ::encode(max_snap, blist);
@@ -564,9 +599,7 @@ private:
     crush.decode(cblp);
 
     // extended
-    ::decode(osd_up_from, p);
-    ::decode(osd_up_thru, p);
-    ::decode(osd_last_clean_interval, p);
+    ::decode(osd_info, p);
     ::decode(pg_swap_primary, p);
     
     ::decode(max_snap, p);
diff --git a/src/osd/PG.cc b/src/osd/PG.cc
index 9a499df985030..caee8bf2d4644 100644
--- a/src/osd/PG.cc
+++ b/src/osd/PG.cc
@@ -788,17 +788,16 @@ void PG::build_prior()
     int crashed = 0;
     bool any_survived = false;
     for (unsigned i=0; i<interval.acting.size(); i++) {
+      const osd_info_t& pinfo = osd->osdmap->get_info(interval.acting[i]);
 
-      // if the osd is not still alive (i.e. failed after this interval) and 
-      // did not stop cleanly, then pg crashed.  
-      // note that it is possible it shut down cleanly after the interval, but we
-      // do not keep full clean_thru info handy for all shutdowns, so we can't
-      // be sure it didn't crash, start, then stop cleanly.
-      pair<epoch_t,epoch_t> lci = osd->osdmap->get_last_clean_interval(interval.acting[i]);
-      if (osd->osdmap->get_up_from(interval.acting[i]) > interval.last &&
-	  !(lci.first <= interval.first && lci.second >= interval.first)) {
+      // if the osd restarted after this interval but is not known to have
+      // cleanly survived through this interval, we mark the pg crashed.
+      if (pinfo.up_from > interval.last &&
+	  !(pinfo.last_clean_first <= interval.first &&
+	    pinfo.last_clean_last >= interval.last)) {
 	dout(10) << "build_prior  prior osd" << interval.acting[i]
-		 << " went down and last clean interval " << lci.first << "-" << lci.second
+		 << " up_from " << pinfo.up_from
+		 << " and last clean interval " << pinfo.last_clean_first << "-" << pinfo.last_clean_last
 		 << " does not include us" << dendl;
 	crashed++;
       }
-- 
2.39.5