From: Sage Weil <sage@redhat.com>
Date: Thu, 10 Aug 2017 18:06:02 +0000 (-0400)
Subject: mgr: implement 'osd safe-to-destroy' and 'ok-to-stop' commands
X-Git-Tag: v12.1.4~2
X-Git-Url: http://git.apps.os.sepia.ceph.com/?a=commitdiff_plain;h=6808af486d3bdce966232810004648502c967b13;p=ceph.git

mgr: implement 'osd safe-to-destroy' and 'ok-to-stop' commands

An osd is safe to destroy if

- we have osd_stat for it
- osd_stat indicates no pgs stored
- all pgs are known
- no pgs map to it

An osd is ok to stop if

- we have pg stats
- no pgs will drop below min_size

Signed-off-by: Sage Weil <sage@redhat.com>
(cherry picked from commit bf9380457bba5a834ffa2927c73165e0f1960332)
---

diff --git a/doc/man/8/ceph.rst b/doc/man/8/ceph.rst
index 0f372d9714be5..a95b4a7e28358 100644
--- a/doc/man/8/ceph.rst
+++ b/doc/man/8/ceph.rst
@@ -39,7 +39,7 @@ Synopsis
 
 | **ceph** **mon_status**
 
-| **ceph** **osd** [ *blacklist* \| *blocked-by* \| *create* \| *new* \| *deep-scrub* \| *df* \| *down* \| *dump* \| *erasure-code-profile* \| *find* \| *getcrushmap* \| *getmap* \| *getmaxosd* \| *in* \| *lspools* \| *map* \| *metadata* \| *out* \| *pause* \| *perf* \| *pg-temp* \| *force-create-pg* \| *primary-affinity* \| *primary-temp* \| *repair* \| *reweight* \| *reweight-by-pg* \| *rm* \| *destroy* \| *purge* \| *scrub* \| *set* \| *setcrushmap* \| *setmaxosd*  \| *stat* \| *tree* \| *unpause* \| *unset* ] ...
+| **ceph** **osd** [ *blacklist* \| *blocked-by* \| *create* \| *new* \| *deep-scrub* \| *df* \| *down* \| *dump* \| *erasure-code-profile* \| *find* \| *getcrushmap* \| *getmap* \| *getmaxosd* \| *in* \| *lspools* \| *map* \| *metadata* \| *ok-to-stop* \| *out* \| *pause* \| *perf* \| *pg-temp* \| *force-create-pg* \| *primary-affinity* \| *primary-temp* \| *repair* \| *reweight* \| *reweight-by-pg* \| *rm* \| *destroy* \| *purge* \| *safe-to-destroy* \| *scrub* \| *set* \| *setcrushmap* \| *setmaxosd*  \| *stat* \| *tree* \| *unpause* \| *unset* ] ...
 
 | **ceph** **osd** **crush** [ *add* \| *add-bucket* \| *create-or-move* \| *dump* \| *get-tunable* \| *link* \| *move* \| *remove* \| *rename-bucket* \| *reweight* \| *reweight-all* \| *reweight-subtree* \| *rm* \| *rule* \| *set* \| *set-tunable* \| *show-tunables* \| *tunables* \| *unlink* ] ...
 
@@ -874,6 +874,18 @@ Usage::
 
 	ceph osd out <ids> [<ids>...]
 
+Subcommand ``ok-to-stop`` checks whether the list of OSD(s) can be
+stopped without immediately making data unavailable.  That is, all
+data should remain readable and writeable, although data redundancy
+may be reduced as some PGs may end up in a degraded (but active)
+state.  It will return a success code if it is okay to stop the
+OSD(s), or an error code and informative message if it is not or if no
+conclusion can be drawn at the current time.
+
+Usage::
+
+  ceph osd ok-to-stop <id> [<ids>...]
+
 Subcommand ``pause`` pauses osd.
 
 Usage::
@@ -1066,6 +1078,16 @@ Usage::
 
     ceph osd purge <id> {--yes-i-really-mean-it}
 
+Subcommand ``safe-to-destroy`` checks whether it is safe to remove or
+destroy an OSD without reducing overall data redundancy or durability.
+It will return a success code if it is definitely safe, or an error
+code and informative message if it is not or if no conclusion can be
+drawn at the current time.
+
+Usage::
+
+  ceph osd safe-to-destroy <id> [<ids>...]
+
 Subcommand ``scrub`` initiates scrub on specified osd.
 
 Usage::
diff --git a/doc/release-notes.rst b/doc/release-notes.rst
index db0dbbc8b049b..76a8977806dba 100644
--- a/doc/release-notes.rst
+++ b/doc/release-notes.rst
@@ -210,6 +210,11 @@ Major Changes from Kraken
     - ``ceph osd {add,rm}-{noout,noin,nodown,noup}`` allow the
       `noout`, `noin`, `nodown`, and `noup` flags to be applied to
       specific OSDs.
+    - ``ceph osd safe-to-destroy <osd(s)>`` will report whether it is safe to
+      remove or destroy OSD(s) without reducing data durability or redundancy.
+    - ``ceph osd ok-to-stop <osd(s)>`` will report whether it is okay to stop
+      OSD(s) without immediately compromising availability (i.e., all PGs
+      should remain active but may be degraded).
     - ``ceph log last [n]`` will output the last *n* lines of the cluster
       log.
     - ``ceph mgr dump`` will dump the MgrMap, including the currently active
diff --git a/src/mgr/DaemonServer.cc b/src/mgr/DaemonServer.cc
index a58675ed24c07..34aac18718188 100644
--- a/src/mgr/DaemonServer.cc
+++ b/src/mgr/DaemonServer.cc
@@ -924,6 +924,163 @@ bool DaemonServer::handle_command(MCommand *m)
       });
     cmdctx->reply(r, "");
     return true;
+  } else if (prefix == "osd safe-to-destroy") {
+    vector<string> ids;
+    cmd_getval(g_ceph_context, cmdctx->cmdmap, "ids", ids);
+    set<int> osds;
+    int r;
+    cluster_state.with_osdmap([&](const OSDMap& osdmap) {
+	r = osdmap.parse_osd_id_list(ids, &osds, &ss);
+      });
+    if (!r && osds.empty()) {
+      ss << "must specify one or more OSDs";
+      r = -EINVAL;
+    }
+    if (r < 0) {
+      cmdctx->reply(r, ss);
+      return true;
+    }
+    set<int> active_osds, missing_stats, stored_pgs;
+    int affected_pgs = 0;
+    cluster_state.with_pgmap([&](const PGMap& pg_map) {
+	if (pg_map.num_pg_unknown > 0) {
+	  ss << pg_map.num_pg_unknown << " pgs have unknown state; cannot draw"
+	     << " any conclusions";
+	  r = -EAGAIN;
+	  return;
+	}
+	int num_active_clean = 0;
+	for (auto& p : pg_map.num_pg_by_state) {
+	  unsigned want = PG_STATE_ACTIVE|PG_STATE_CLEAN;
+	  if ((p.first & want) == want) {
+	    num_active_clean += p.second;
+	  }
+	}
+	cluster_state.with_osdmap([&](const OSDMap& osdmap) {
+	    for (auto osd : osds) {
+	      if (!osdmap.exists(osd)) {
+		continue;  // clearly safe to destroy
+	      }
+	      auto q = pg_map.num_pg_by_osd.find(osd);
+	      if (q != pg_map.num_pg_by_osd.end()) {
+		if (q->second.acting > 0 || q->second.up > 0) {
+		  active_osds.insert(osd);
+		  affected_pgs += q->second.acting + q->second.up;
+		  continue;
+		}
+	      }
+	      if (num_active_clean < pg_map.num_pg) {
+		// not all pgs are active+clean; we need to be careful.
+		auto p = pg_map.osd_stat.find(osd);
+		if (p == pg_map.osd_stat.end()) {
+		  missing_stats.insert(osd);
+		} else if (p->second.num_pgs > 0) {
+		  // only dereference p once we know stats exist for this osd
+		  stored_pgs.insert(osd);
+		}
+	      }
+	    }
+	  });
+      });
+    if (!r && !active_osds.empty()) {
+      ss << "OSD(s) " << active_osds << " have " << affected_pgs
+	 << " pgs currently mapped to them";
+      r = -EBUSY;
+    } else if (!missing_stats.empty()) {
+      ss << "OSD(s) " << missing_stats << " have no reported stats, and not all"
+	 << " PGs are active+clean; we cannot draw any conclusions";
+      r = -EAGAIN;
+    } else if (!stored_pgs.empty()) {
+      ss << "OSD(s) " << stored_pgs << " last reported they still store some PG"
+	 << " data, and not all PGs are active+clean; we cannot be sure they"
+	 << " aren't still needed.";
+      r = -EBUSY;
+    }
+    if (r) {
+      cmdctx->reply(r, ss);
+      return true;
+    }
+    ss << "OSD(s) " << osds << " are safe to destroy without reducing data"
+       << " durability.";
+    cmdctx->reply(0, ss);
+    return true;
+  } else if (prefix == "osd ok-to-stop") {
+    vector<string> ids;
+    cmd_getval(g_ceph_context, cmdctx->cmdmap, "ids", ids);
+    set<int> osds;
+    int r;
+    cluster_state.with_osdmap([&](const OSDMap& osdmap) {
+	r = osdmap.parse_osd_id_list(ids, &osds, &ss);
+      });
+    if (!r && osds.empty()) {
+      ss << "must specify one or more OSDs";
+      r = -EINVAL;
+    }
+    if (r < 0) {
+      cmdctx->reply(r, ss);
+      return true;
+    }
+    map<pg_t,int> pg_delta;  // pgid -> net acting set size change
+    int dangerous_pgs = 0;
+    cluster_state.with_pgmap([&](const PGMap& pg_map) {
+	return cluster_state.with_osdmap([&](const OSDMap& osdmap) {
+	    if (pg_map.num_pg_unknown > 0) {
+	      ss << pg_map.num_pg_unknown << " pgs have unknown state; "
+		 << "cannot draw any conclusions";
+	      r = -EAGAIN;
+	      return;
+	    }
+	    for (auto osd : osds) {
+	      auto p = pg_map.pg_by_osd.find(osd);
+	      if (p != pg_map.pg_by_osd.end()) {
+		for (auto& pgid : p->second) {
+		  --pg_delta[pgid];
+		}
+	      }
+	    }
+	    for (auto& p : pg_delta) {
+	      auto q = pg_map.pg_stat.find(p.first);
+	      if (q == pg_map.pg_stat.end()) {
+		ss << "missing information about " << p.first << "; cannot draw"
+		   << " any conclusions";
+		r = -EAGAIN;
+		return;
+	      }
+	      if (!(q->second.state & PG_STATE_ACTIVE) ||
+		  (q->second.state & PG_STATE_DEGRADED)) {
+		// we don't currently have a good way to tell *how* degraded
+		// a degraded PG is, so we have to assume we cannot remove
+		// any more replicas/shards.
+		++dangerous_pgs;
+		continue;
+	      }
+	      const pg_pool_t *pi = osdmap.get_pg_pool(p.first.pool());
+	      if (!pi) {
+		++dangerous_pgs; // pool is creating or deleting
+	      } else {
+		if (q->second.acting.size() + p.second < pi->min_size) {
+		  ++dangerous_pgs;
+		}
+	      }
+	    }
+	  });
+      });
+    if (r) {
+      cmdctx->reply(r, ss);
+      return true;
+    }
+    if (dangerous_pgs) {
+      ss << dangerous_pgs << " PGs are already degraded or might become "
+	 << "unavailable";
+      cmdctx->reply(-EBUSY, ss);
+      return true;
+    }
+    ss << "OSD(s) " << osds << " are ok to stop without reducing"
+       << " availability, provided there are no other concurrent failures"
+       << " or interventions. " << pg_delta.size() << " PGs are likely to be"
+       << " degraded (but remain available) as a result.";
+    cmdctx->reply(0, ss);
+    return true;
   } else if (prefix == "pg force-recovery" ||
   	       prefix == "pg force-backfill" ||
   	       prefix == "pg cancel-force-recovery" ||
diff --git a/src/mgr/MgrCommands.h b/src/mgr/MgrCommands.h
index e63bfc48c79bc..1818454e1fcc4 100644
--- a/src/mgr/MgrCommands.h
+++ b/src/mgr/MgrCommands.h
@@ -107,6 +107,13 @@ COMMAND("osd test-reweight-by-pg " \
 	"dry run of reweight OSDs by PG distribution [overload-percentage-for-consideration, default 120]", \
 	"osd", "r", "cli,rest")
 
+COMMAND("osd safe-to-destroy name=ids,type=CephString,n=N",
+	"check whether osd(s) can be safely destroyed without reducing data durability",
+	"osd", "r", "cli,rest")
+COMMAND("osd ok-to-stop name=ids,type=CephString,n=N",
+	"check whether osd(s) can be safely stopped without reducing immediate"\
+	" data availability", "osd", "r", "cli,rest")
+
 COMMAND("osd scrub " \
 	"name=who,type=CephString", \
 	"initiate scrub on osd <who>, or use <all|any|*> to scrub all", \