osd: allow admin to mark osd lost to kickstart recovery (disk format change)

author Sage Weil <sage@newdream.net>

Wed, 26 Nov 2008 22:48:52 +0000 (14:48 -0800)

committer Sage Weil <sage@newdream.net>

Tue, 2 Dec 2008 00:31:35 +0000 (16:31 -0800)
author Sage Weil <sage@newdream.net>
Wed, 26 Nov 2008 22:48:52 +0000 (14:48 -0800)
committer Sage Weil <sage@newdream.net>
Tue, 2 Dec 2008 00:31:35 +0000 (16:31 -0800)
diff --git a/src/TODO b/src/TODO

index 0740e4043a20c4a56f4044f9d2eac67941b2c0ec..d3cd344a9db1e8e1a993d5ad44cc1afd6ac85aef 100644 (file)
--- a/src/TODO
+++ b/src/TODO
@@ -148,8 +148,8 @@ mon
  - osdmon needs to lower-bound old osdmap versions it keeps around?
  
  osd
+- pg split should be a work queue
  - pg split needs to fix up pg stats.  this is tricky with the clone overlap business...
-- how does an admin intervene when a pg needs to repeer despite a dead osd?
  - generalize ack semantics?  or just change ack from memory to journal?  memory/journal/disk...
  - rdlocks
  - optimize remove wrt recovery pushes
diff --git a/src/include/ceph_fs.h b/src/include/ceph_fs.h

index 16d2c7c2ba7d90ce8e0a7e61fecdd86a0545195d..fa63f2f9d2db93c1981d5cf398fe6c7496bd3aa5 100644 (file)
--- a/src/include/ceph_fs.h
+++ b/src/include/ceph_fs.h
@@ -22,7 +22,7 @@
   * whenever the wire protocol changes.  try to keep this string length
   * constant.
   */
-#define CEPH_BANNER "ceph 010\n"
+#define CEPH_BANNER "ceph 011\n"
  #define CEPH_BANNER_MAX_LEN 30
  
  /*
@@ -34,8 +34,8 @@
  #define CEPH_MON_PROTOCOL    2
  #define CEPH_CLIENT_PROTOCOL 1
  
-#define CEPH_OSD_ONDISK_MAGIC "ceph osd volume v004"
-#define CEPH_MON_ONDISK_MAGIC "ceph mon volume v003"
+#define CEPH_OSD_ONDISK_MAGIC "ceph osd volume v005"
+#define CEPH_MON_ONDISK_MAGIC "ceph mon volume v004"
  
  /*
   * types in this file are defined as little-endian, and are
diff --git a/src/mon/OSDMonitor.cc b/src/mon/OSDMonitor.cc

index 4b55896426153b0db5a82e392d1e4b620bf814b8..e6d3a0a4fa96b5e73e2a820fd8cbad7123bbb0c0 100644 (file)
--- a/src/mon/OSDMonitor.cc
+++ b/src/mon/OSDMonitor.cc
@@ -996,7 +996,7 @@ bool OSDMonitor::prepare_command(MMonCommand *m)
         return true;
        }
      }
-    else if (m->cmd[1] == "down" && m->cmd.size() > 2) {
+    else if (m->cmd[1] == "down" && m->cmd.size() == 3) {
        long osd = strtol(m->cmd[2].c_str(), 0, 10);
        if (osdmap.is_down(osd)) {
         ss << "osd" << osd << " is already down";
@@ -1014,7 +1014,7 @@ bool OSDMonitor::prepare_command(MMonCommand *m)
         return true;
        }
      }
-    else if (m->cmd[1] == "out" && m->cmd.size() > 2) {
+    else if (m->cmd[1] == "out" && m->cmd.size() == 3) {
        long osd = strtol(m->cmd[2].c_str(), 0, 10);
        if (osdmap.is_out(osd)) {
         ss << "osd" << osd << " is already out";
@@ -1028,7 +1028,7 @@ bool OSDMonitor::prepare_command(MMonCommand *m)
         return true;
        } 
      }
-    else if (m->cmd[1] == "in" && m->cmd.size() > 2) {
+    else if (m->cmd[1] == "in" && m->cmd.size() == 3) {
        long osd = strtol(m->cmd[2].c_str(), 0, 10);
        if (osdmap.is_in(osd)) {
         ss << "osd" << osd << " is already in";
@@ -1042,7 +1042,7 @@ bool OSDMonitor::prepare_command(MMonCommand *m)
         return true;
        } 
      }
-    else if (m->cmd[1] == "reweight" && m->cmd.size() > 3) {
+    else if (m->cmd[1] == "reweight" && m->cmd.size() == 4) {
        long osd = strtol(m->cmd[2].c_str(), 0, 10);
        float w = strtof(m->cmd[3].c_str(), 0);
        long ww = (int)((float)CEPH_OSD_IN*w);
@@ -1054,6 +1054,23 @@ bool OSDMonitor::prepare_command(MMonCommand *m)
         return true;
        } 
      }
+    else if (m->cmd[1] == "lost" && m->cmd.size() >= 3) {
+      long osd = strtol(m->cmd[2].c_str(), 0, 10);
+      if (m->cmd.size() < 4 ||
+         m->cmd[3] != "--yes-i-really-mean-it") {
+       ss << "are you SURE?  this might mean real, permanent data loss.  pass --yes-i-really-mean-it if you really do.";
+      }
+      else if (!osdmap.exists(osd) || !osdmap.is_down(osd)) {
+       ss << "osd" << osd << " is not down or doesn't exist";
+      } else {
+       epoch_t e = osdmap.get_info(osd).down_at;
+       pending_inc.new_lost[osd] = e;
+       ss << "marked osd lost in epoch " << e;
+       getline(ss, rs);
+       paxos->wait_for_commit(new Monitor::C_Command(mon, m, 0, rs));
+       return true;
+      }
+    }
      else {
        ss << "unknown command " << m->cmd[1];
      }
diff --git a/src/osd/OSDMap.h b/src/osd/OSDMap.h

index f55133694b035c1bdc4bdb1102224dba08f13345..6482afeb005970de43d4d2954178b719656e61cd 100644 (file)
--- a/src/osd/OSDMap.h
+++ b/src/osd/OSDMap.h
@@ -78,6 +78,15 @@ inline int calc_bits_of(int t) {
   * _finished_, or during which the osd cleanly shut down.  when
   * possible, we push this forward to the epoch the osd was eventually
   * marked down.
+ *
+ * the lost_at is used to allow build_prior to proceed without waiting
+ * for an osd to recover.  In certain cases, progress may be blocked 
+ * because an osd is down that may contain updates (i.e., a pg may have
+ * gone rw during an interval).  If the osd can't be brought online, we
+ * can force things to proceed knowing that we _might_ be losing some
+ * acked writes.  If the osd comes back to life later, that's fine too,
+ * but those writes will still be lost (the divergent objects will be
+ * thrown out).
   */
  struct osd_info_t {
    epoch_t last_clean_first;  // last interval that ended with a clean osd shutdown
@@ -85,15 +94,17 @@ struct osd_info_t {
    epoch_t up_from;   // epoch osd marked up
    epoch_t up_thru;   // lower bound on actual osd death (if > up_from)
    epoch_t down_at;   // upper bound on actual osd death (if > up_from)
+  epoch_t lost_at;   // last epoch we decided data was "lost"
    
    osd_info_t() : last_clean_first(0), last_clean_last(0),
-                up_from(0), up_thru(0), down_at(0) {}
+                up_from(0), up_thru(0), down_at(0), lost_at(0) {}
    void encode(bufferlist& bl) const {
      ::encode(last_clean_first, bl);
      ::encode(last_clean_last, bl);
      ::encode(up_from, bl);
      ::encode(up_thru, bl);
      ::encode(down_at, bl);
+    ::encode(lost_at, bl);
    }
    void decode(bufferlist::iterator& bl) {
      ::decode(last_clean_first, bl);
@@ -101,6 +112,7 @@ struct osd_info_t {
      ::decode(up_from, bl);
      ::decode(up_thru, bl);
      ::decode(down_at, bl);
+    ::decode(lost_at, bl);
    }
  };
  WRITE_CLASS_ENCODER(osd_info_t)
@@ -144,6 +156,7 @@ public:
      map<int32_t,uint32_t> new_weight;
      map<int32_t,epoch_t> new_up_thru;
      map<int32_t,pair<epoch_t,epoch_t> > new_last_clean_interval;
+    map<int32_t,epoch_t> new_lost;
      map<pg_t,uint32_t> new_pg_swap_primary;
      list<pg_t> old_pg_swap_primary;
  
@@ -174,6 +187,7 @@ public:
        // extended
        ::encode(new_up_thru, bl);
        ::encode(new_last_clean_interval, bl);
+      ::encode(new_lost, bl);
        ::encode(new_pg_swap_primary, bl);
        ::encode(old_pg_swap_primary, bl);
        ::encode(new_max_snap, bl);
@@ -202,6 +216,7 @@ public:
        // extended
        ::decode(new_up_thru, p);
        ::decode(new_last_clean_interval, p);
+      ::decode(new_lost, p);
        ::decode(new_pg_swap_primary, p);
        ::decode(old_pg_swap_primary, p);
        ::decode(new_max_snap, p);
@@ -542,6 +557,8 @@ private:
        osd_info[i->first].last_clean_first = i->second.first;
        osd_info[i->first].last_clean_last = i->second.second;
      }
+    for (map<int32_t,epoch_t>::iterator p = inc.new_lost.begin(); p != inc.new_lost.end(); p++)
+      osd_info[p->first].lost_at = p->second;
  
      // pg swap
      for (map<pg_t,uint32_t>::iterator i = inc.new_pg_swap_primary.begin();
diff --git a/src/osd/PG.cc b/src/osd/PG.cc

index 8da5a7a258523fbbea6c31cb52784fc068d0b4a2..d2df2cbfb142d358d451ddb381cbcd2beb3c48f7 100644 (file)
--- a/src/osd/PG.cc
+++ b/src/osd/PG.cc
@@ -790,6 +790,7 @@ void PG::build_prior()
      OSDMap *lastmap = osd->get_map(interval.last);
  
      int crashed = 0;
+    int need_down = 0;
      bool any_survived = false;
      for (unsigned i=0; i<interval.acting.size(); i++) {
        const osd_info_t& pinfo = osd->osdmap->get_info(interval.acting[i]);
@@ -817,10 +818,15 @@ void PG::build_prior()
         if (interval.first <= info.history.last_epoch_started &&
             interval.last >= info.history.last_epoch_started)
           any_up_now = true;
+      } else if (pinfo.lost_at > interval.first) {
+       dout(10) << "build_prior  prior osd" << interval.acting[i]
+                << " is down, but marked lost at " << pinfo.lost_at << dendl;
+       prior_set_down.insert(interval.acting[i]);
        } else {
         dout(10) << "build_prior  prior osd" << interval.acting[i]
                  << " is down, must notify mon" << dendl;
         must_notify_mon = true;
+       need_down++;
         prior_set_down.insert(interval.acting[i]);
        }
      }
@@ -828,13 +834,15 @@ void PG::build_prior()
      // if nobody survived this interval, and we may have gone rw,
      // then we need to wait for one of those osds to recover to
      // ensure that we haven't lost any information.
-    if (!any_survived && interval.maybe_went_rw) {
+    if (!any_survived && need_down && interval.maybe_went_rw) {
        // fixme: how do we identify a "clean" shutdown anyway?
-      dout(10) << "build_prior  possibly went active+rw, no survivors, including" << dendl;
+      dout(10) << "build_prior  " << need_down
+              << " osds possibly went active+rw, no survivors, including" << dendl;
        for (unsigned i=0; i<interval.acting.size(); i++)
         if (osd->osdmap->is_down(interval.acting[i])) {
           prior_set.insert(interval.acting[i]);
           prior_set_down.erase(interval.acting[i]);
+         state_set(PG_STATE_DOWN);
         }
        some_down = true;
        
@@ -851,12 +859,6 @@ void PG::build_prior()
      }
    }
  
-  if (info.history.last_epoch_started < info.history.same_since &&
-      !any_up_now) {
-    dout(10) << "build_prior  no osds are up from the last epoch started, PG is down for now." << dendl;
-    state_set(PG_STATE_DOWN);
-  }
-
    dout(10) << "build_prior = " << prior_set
            << " down = " << prior_set_down << " ..."
            << (is_crashed() ? " crashed":"")
author	Sage Weil <sage@newdream.net>
	Wed, 26 Nov 2008 22:48:52 +0000 (14:48 -0800)
committer	Sage Weil <sage@newdream.net>
	Tue, 2 Dec 2008 00:31:35 +0000 (16:31 -0800)
src/TODO		patch \| blob \| history
src/include/ceph_fs.h		patch \| blob \| history
src/mon/OSDMonitor.cc		patch \| blob \| history
src/osd/OSDMap.h		patch \| blob \| history
src/osd/PG.cc		patch \| blob \| history