From 95d6947a1d8ec9a7a6806394ff239071837d6702 Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Wed, 26 Nov 2008 14:48:52 -0800
Subject: [PATCH] osd: allow admin to mark osd lost to kickstart recovery (disk
 format change)

This is important when an osd (or osds) that may contain modifications
is offline and can't be brought back online.  If the data is truly
lost, the admin can mark the osd lost to kickstart recovery.

Note that if the osd was storing metadata, this could be
especially dangerous!
---
 src/TODO              |  2 +-
 src/include/ceph_fs.h |  6 +++---
 src/mon/OSDMonitor.cc | 25 +++++++++++++++++++++----
 src/osd/OSDMap.h      | 19 ++++++++++++++++++-
 src/osd/PG.cc         | 18 ++++++++++--------
 5 files changed, 53 insertions(+), 17 deletions(-)

diff --git a/src/TODO b/src/TODO
index 0740e4043a20c..d3cd344a9db1e 100644
--- a/src/TODO
+++ b/src/TODO
@@ -148,8 +148,8 @@ mon
 - osdmon needs to lower-bound old osdmap versions it keeps around?
 
 osd
+- pg split should be a work queue
 - pg split needs to fix up pg stats.  this is tricky with the clone overlap business...
-- how does an admin intervene when a pg needs to repeer despite a dead osd?
 - generalize ack semantics?  or just change ack from memory to journal?  memory/journal/disk...
 - rdlocks
 - optimize remove wrt recovery pushes
diff --git a/src/include/ceph_fs.h b/src/include/ceph_fs.h
index 16d2c7c2ba7d9..fa63f2f9d2db9 100644
--- a/src/include/ceph_fs.h
+++ b/src/include/ceph_fs.h
@@ -22,7 +22,7 @@
  * whenever the wire protocol changes.  try to keep this string length
  * constant.
  */
-#define CEPH_BANNER "ceph 010\n"
+#define CEPH_BANNER "ceph 011\n"
 #define CEPH_BANNER_MAX_LEN 30
 
 /*
@@ -34,8 +34,8 @@
 #define CEPH_MON_PROTOCOL    2
 #define CEPH_CLIENT_PROTOCOL 1
 
-#define CEPH_OSD_ONDISK_MAGIC "ceph osd volume v004"
-#define CEPH_MON_ONDISK_MAGIC "ceph mon volume v003"
+#define CEPH_OSD_ONDISK_MAGIC "ceph osd volume v005"
+#define CEPH_MON_ONDISK_MAGIC "ceph mon volume v004"
 
 /*
  * types in this file are defined as little-endian, and are
diff --git a/src/mon/OSDMonitor.cc b/src/mon/OSDMonitor.cc
index 4b55896426153..e6d3a0a4fa96b 100644
--- a/src/mon/OSDMonitor.cc
+++ b/src/mon/OSDMonitor.cc
@@ -996,7 +996,7 @@ bool OSDMonitor::prepare_command(MMonCommand *m)
 	return true;
       }
     }
-    else if (m->cmd[1] == "down" && m->cmd.size() > 2) {
+    else if (m->cmd[1] == "down" && m->cmd.size() == 3) {
       long osd = strtol(m->cmd[2].c_str(), 0, 10);
       if (osdmap.is_down(osd)) {
 	ss << "osd" << osd << " is already down";
@@ -1014,7 +1014,7 @@ bool OSDMonitor::prepare_command(MMonCommand *m)
 	return true;
       }
     }
-    else if (m->cmd[1] == "out" && m->cmd.size() > 2) {
+    else if (m->cmd[1] == "out" && m->cmd.size() == 3) {
       long osd = strtol(m->cmd[2].c_str(), 0, 10);
       if (osdmap.is_out(osd)) {
 	ss << "osd" << osd << " is already out";
@@ -1028,7 +1028,7 @@ bool OSDMonitor::prepare_command(MMonCommand *m)
 	return true;
       } 
     }
-    else if (m->cmd[1] == "in" && m->cmd.size() > 2) {
+    else if (m->cmd[1] == "in" && m->cmd.size() == 3) {
       long osd = strtol(m->cmd[2].c_str(), 0, 10);
       if (osdmap.is_in(osd)) {
 	ss << "osd" << osd << " is already in";
@@ -1042,7 +1042,7 @@ bool OSDMonitor::prepare_command(MMonCommand *m)
 	return true;
       } 
     }
-    else if (m->cmd[1] == "reweight" && m->cmd.size() > 3) {
+    else if (m->cmd[1] == "reweight" && m->cmd.size() == 4) {
       long osd = strtol(m->cmd[2].c_str(), 0, 10);
       float w = strtof(m->cmd[3].c_str(), 0);
       long ww = (int)((float)CEPH_OSD_IN*w);
@@ -1054,6 +1054,23 @@ bool OSDMonitor::prepare_command(MMonCommand *m)
 	return true;
       } 
     }
+    else if (m->cmd[1] == "lost" && m->cmd.size() >= 3) {
+      long osd = strtol(m->cmd[2].c_str(), 0, 10);
+      if (m->cmd.size() < 4 ||
+	  m->cmd[3] != "--yes-i-really-mean-it") {
+	ss << "are you SURE?  this might mean real, permanent data loss.  pass --yes-i-really-mean-it if you really do.";
+      }
+      else if (!osdmap.exists(osd) || !osdmap.is_down(osd)) {
+	ss << "osd" << osd << " is not down or doesn't exist";
+      } else {
+	epoch_t e = osdmap.get_info(osd).down_at;
+	pending_inc.new_lost[osd] = e;
+	ss << "marked osd lost in epoch " << e;
+	getline(ss, rs);
+	paxos->wait_for_commit(new Monitor::C_Command(mon, m, 0, rs));
+	return true;
+      }
+    }
     else {
       ss << "unknown command " << m->cmd[1];
     }
diff --git a/src/osd/OSDMap.h b/src/osd/OSDMap.h
index f55133694b035..6482afeb00597 100644
--- a/src/osd/OSDMap.h
+++ b/src/osd/OSDMap.h
@@ -78,6 +78,15 @@ inline int calc_bits_of(int t) {
  * _finished_, or during which the osd cleanly shut down.  when
  * possible, we push this forward to the epoch the osd was eventually
  * marked down.
+ *
+ * the lost_at is used to allow build_prior to proceed without waiting
+ * for an osd to recover.  In certain cases, progress may be blocked 
+ * because an osd is down that may contain updates (i.e., a pg may have
+ * gone rw during an interval).  If the osd can't be brought online, we
+ * can force things to proceed knowing that we _might_ be losing some
+ * acked writes.  If the osd comes back to life later, that's fine too,
+ * but those writes will still be lost (the divergent objects will be
+ * thrown out).
  */
 struct osd_info_t {
   epoch_t last_clean_first;  // last interval that ended with a clean osd shutdown
@@ -85,15 +94,17 @@ struct osd_info_t {
   epoch_t up_from;   // epoch osd marked up
   epoch_t up_thru;   // lower bound on actual osd death (if > up_from)
   epoch_t down_at;   // upper bound on actual osd death (if > up_from)
+  epoch_t lost_at;   // last epoch we decided data was "lost"
   
   osd_info_t() : last_clean_first(0), last_clean_last(0),
-		 up_from(0), up_thru(0), down_at(0) {}
+		 up_from(0), up_thru(0), down_at(0), lost_at(0) {}
   void encode(bufferlist& bl) const {
     ::encode(last_clean_first, bl);
     ::encode(last_clean_last, bl);
     ::encode(up_from, bl);
     ::encode(up_thru, bl);
     ::encode(down_at, bl);
+    ::encode(lost_at, bl);
   }
   void decode(bufferlist::iterator& bl) {
     ::decode(last_clean_first, bl);
@@ -101,6 +112,7 @@ struct osd_info_t {
     ::decode(up_from, bl);
     ::decode(up_thru, bl);
     ::decode(down_at, bl);
+    ::decode(lost_at, bl);
   }
 };
 WRITE_CLASS_ENCODER(osd_info_t)
@@ -144,6 +156,7 @@ public:
     map<int32_t,uint32_t> new_weight;
     map<int32_t,epoch_t> new_up_thru;
     map<int32_t,pair<epoch_t,epoch_t> > new_last_clean_interval;
+    map<int32_t,epoch_t> new_lost;
     map<pg_t,uint32_t> new_pg_swap_primary;
     list<pg_t> old_pg_swap_primary;
 
@@ -174,6 +187,7 @@ public:
       // extended
       ::encode(new_up_thru, bl);
       ::encode(new_last_clean_interval, bl);
+      ::encode(new_lost, bl);
       ::encode(new_pg_swap_primary, bl);
       ::encode(old_pg_swap_primary, bl);
       ::encode(new_max_snap, bl);
@@ -202,6 +216,7 @@ public:
       // extended
       ::decode(new_up_thru, p);
       ::decode(new_last_clean_interval, p);
+      ::decode(new_lost, p);
       ::decode(new_pg_swap_primary, p);
       ::decode(old_pg_swap_primary, p);
       ::decode(new_max_snap, p);
@@ -542,6 +557,8 @@ private:
       osd_info[i->first].last_clean_first = i->second.first;
       osd_info[i->first].last_clean_last = i->second.second;
     }
+    for (map<int32_t,epoch_t>::iterator p = inc.new_lost.begin(); p != inc.new_lost.end(); p++)
+      osd_info[p->first].lost_at = p->second;
 
     // pg swap
     for (map<pg_t,uint32_t>::iterator i = inc.new_pg_swap_primary.begin();
diff --git a/src/osd/PG.cc b/src/osd/PG.cc
index 8da5a7a258523..d2df2cbfb142d 100644
--- a/src/osd/PG.cc
+++ b/src/osd/PG.cc
@@ -790,6 +790,7 @@ void PG::build_prior()
     OSDMap *lastmap = osd->get_map(interval.last);
 
     int crashed = 0;
+    int need_down = 0;
     bool any_survived = false;
     for (unsigned i=0; i<interval.acting.size(); i++) {
       const osd_info_t& pinfo = osd->osdmap->get_info(interval.acting[i]);
@@ -817,10 +818,15 @@ void PG::build_prior()
 	if (interval.first <= info.history.last_epoch_started &&
 	    interval.last >= info.history.last_epoch_started)
 	  any_up_now = true;
+      } else if (pinfo.lost_at > interval.first) {
+	dout(10) << "build_prior  prior osd" << interval.acting[i]
+		 << " is down, but marked lost at " << pinfo.lost_at << dendl;
+	prior_set_down.insert(interval.acting[i]);
       } else {
 	dout(10) << "build_prior  prior osd" << interval.acting[i]
 		 << " is down, must notify mon" << dendl;
 	must_notify_mon = true;
+	need_down++;
 	prior_set_down.insert(interval.acting[i]);
       }
     }
@@ -828,13 +834,15 @@ void PG::build_prior()
     // if nobody survived this interval, and we may have gone rw,
     // then we need to wait for one of those osds to recover to
     // ensure that we haven't lost any information.
-    if (!any_survived && interval.maybe_went_rw) {
+    if (!any_survived && need_down && interval.maybe_went_rw) {
       // fixme: how do we identify a "clean" shutdown anyway?
-      dout(10) << "build_prior  possibly went active+rw, no survivors, including" << dendl;
+      dout(10) << "build_prior  " << need_down
+	       << " osds possibly went active+rw, no survivors, including" << dendl;
       for (unsigned i=0; i<interval.acting.size(); i++)
 	if (osd->osdmap->is_down(interval.acting[i])) {
 	  prior_set.insert(interval.acting[i]);
 	  prior_set_down.erase(interval.acting[i]);
+	  state_set(PG_STATE_DOWN);
 	}
       some_down = true;
       
@@ -851,12 +859,6 @@ void PG::build_prior()
     }
   }
 
-  if (info.history.last_epoch_started < info.history.same_since &&
-      !any_up_now) {
-    dout(10) << "build_prior  no osds are up from the last epoch started, PG is down for now." << dendl;
-    state_set(PG_STATE_DOWN);
-  }
-
   dout(10) << "build_prior = " << prior_set
 	   << " down = " << prior_set_down << " ..."
 	   << (is_crashed() ? " crashed":"")
-- 
2.39.5