mgr: implement 'osd safe-to-destroy' and 'ok-to-stop' commands

author Sage Weil <sage@redhat.com>

Thu, 10 Aug 2017 18:06:02 +0000 (14:06 -0400)

committer Sage Weil <sage@redhat.com>

Sun, 13 Aug 2017 19:03:41 +0000 (15:03 -0400)
author Sage Weil <sage@redhat.com>
Thu, 10 Aug 2017 18:06:02 +0000 (14:06 -0400)
committer Sage Weil <sage@redhat.com>
Sun, 13 Aug 2017 19:03:41 +0000 (15:03 -0400)
diff --git a/doc/man/8/ceph.rst b/doc/man/8/ceph.rst

index 0f372d9714be5ee6498ab466dc4f1940bc677aa8..a95b4a7e283582ba2939e585f9274440c0fe1150 100644 (file)
--- a/doc/man/8/ceph.rst
+++ b/doc/man/8/ceph.rst
@@ -39,7 +39,7 @@ Synopsis
  
  | **ceph** **mon_status**
  
-| **ceph** **osd** [ *blacklist* \| *blocked-by* \| *create* \| *new* \| *deep-scrub* \| *df* \| *down* \| *dump* \| *erasure-code-profile* \| *find* \| *getcrushmap* \| *getmap* \| *getmaxosd* \| *in* \| *lspools* \| *map* \| *metadata* \| *out* \| *pause* \| *perf* \| *pg-temp* \| *force-create-pg* \| *primary-affinity* \| *primary-temp* \| *repair* \| *reweight* \| *reweight-by-pg* \| *rm* \| *destroy* \| *purge* \| *scrub* \| *set* \| *setcrushmap* \| *setmaxosd*  \| *stat* \| *tree* \| *unpause* \| *unset* ] ...
+| **ceph** **osd** [ *blacklist* \| *blocked-by* \| *create* \| *new* \| *deep-scrub* \| *df* \| *down* \| *dump* \| *erasure-code-profile* \| *find* \| *getcrushmap* \| *getmap* \| *getmaxosd* \| *in* \| *lspools* \| *map* \| *metadata* \| *ok-to-stop* \| *out* \| *pause* \| *perf* \| *pg-temp* \| *force-create-pg* \| *primary-affinity* \| *primary-temp* \| *repair* \| *reweight* \| *reweight-by-pg* \| *rm* \| *destroy* \| *purge* \| *safe-to-destroy* \| *scrub* \| *set* \| *setcrushmap* \| *setmaxosd*  \| *stat* \| *tree* \| *unpause* \| *unset* ] ...
  
  | **ceph** **osd** **crush** [ *add* \| *add-bucket* \| *create-or-move* \| *dump* \| *get-tunable* \| *link* \| *move* \| *remove* \| *rename-bucket* \| *reweight* \| *reweight-all* \| *reweight-subtree* \| *rm* \| *rule* \| *set* \| *set-tunable* \| *show-tunables* \| *tunables* \| *unlink* ] ...
  
@@ -874,6 +874,18 @@ Usage::
  
         ceph osd out <ids> [<ids>...]
  
+Subcommand ``ok-to-stop`` checks whether the list of OSD(s) can be
+stopped without immediately making data unavailable.  That is, all
+data should remain readable and writeable, although data redundancy
+may be reduced as some PGs may end up in a degraded (but active)
+state.  It will return a success code if it is okay to stop the
+OSD(s), or an error code and informative message if it is not or if no
+conclusion can be drawn at the current time.
+
+Usage::
+
+  ceph osd ok-to-stop <id> [<ids>...]
+
  Subcommand ``pause`` pauses osd.
  
  Usage::
@@ -1066,6 +1078,16 @@ Usage::
  
      ceph osd purge <id> {--yes-i-really-mean-it}
  
+Subcommand ``safe-to-destroy`` checks whether it is safe to remove or
+destroy an OSD without reducing overall data redundancy or durability.
+It will return a success code if it is definitely safe, or an error
+code and informative message if it is not or if no conclusion can be
+drawn at the current time.
+
+Usage::
+
+  ceph osd safe-to-destroy <id> [<ids>...]
+
  Subcommand ``scrub`` initiates scrub on specified osd.
  
  Usage::
diff --git a/doc/release-notes.rst b/doc/release-notes.rst

index db0dbbc8b049b9c917007dc0665c822b4946486a..76a8977806dbae19c77f9fc50bc3b825c29a8697 100644 (file)
--- a/doc/release-notes.rst
+++ b/doc/release-notes.rst
@@ -210,6 +210,11 @@ Major Changes from Kraken
      - ``ceph osd {add,rm}-{noout,noin,nodown,noup}`` allow the
        `noout`, `noin`, `nodown`, and `noup` flags to be applied to
        specific OSDs.
+    - ``ceph osd safe-to-destroy <osd(s)>`` will report whether it is safe to
+      remove or destroy OSD(s) without reducing data durability or redundancy.
+    - ``ceph osd ok-to-stop <osd(s)>`` will report whether it is okay to stop
+      OSD(s) without immediately compromising availability (i.e., all PGs
+      should remain active but may be degraded).
      - ``ceph log last [n]`` will output the last *n* lines of the cluster
        log.
      - ``ceph mgr dump`` will dump the MgrMap, including the currently active
diff --git a/src/mgr/DaemonServer.cc b/src/mgr/DaemonServer.cc

index a58675ed24c071b5e38ad48db43958f59fda8e7e..34aac187181889e22e5131a9a5ad6d34eb127f24 100644 (file)
--- a/src/mgr/DaemonServer.cc
+++ b/src/mgr/DaemonServer.cc
@@ -924,6 +924,163 @@ bool DaemonServer::handle_command(MCommand *m)
        });
      cmdctx->reply(r, "");
      return true;
+  } else if (prefix == "osd safe-to-destroy") {
+    vector<string> ids;
+    cmd_getval(g_ceph_context, cmdctx->cmdmap, "ids", ids);
+    set<int> osds;
+    int r;
+    cluster_state.with_osdmap([&](const OSDMap& osdmap) {
+       r = osdmap.parse_osd_id_list(ids, &osds, &ss);
+      });
+    if (!r && osds.empty()) {
+      ss << "must specify one or more OSDs";
+      r = -EINVAL;
+    }
+    if (r < 0) {
+      cmdctx->reply(r, ss);
+      return true;
+    }
+    set<int> active_osds, missing_stats, stored_pgs;
+    int affected_pgs = 0;
+    cluster_state.with_pgmap([&](const PGMap& pg_map) {
+       if (pg_map.num_pg_unknown > 0) {
+         ss << pg_map.num_pg_unknown << " pgs have unknown state; cannot draw"
+            << " any conclusions";
+         r = -EAGAIN;
+         return;
+       }
+       int num_active_clean = 0;
+       for (auto& p : pg_map.num_pg_by_state) {
+         unsigned want = PG_STATE_ACTIVE|PG_STATE_CLEAN;
+         if ((p.first & want) == want) {
+           num_active_clean += p.second;
+         }
+       }
+       cluster_state.with_osdmap([&](const OSDMap& osdmap) {
+           for (auto osd : osds) {
+             if (!osdmap.exists(osd)) {
+               continue;  // clearly safe to destroy
+             }
+             auto q = pg_map.num_pg_by_osd.find(osd);
+             if (q != pg_map.num_pg_by_osd.end()) {
+               if (q->second.acting > 0 || q->second.up > 0) {
+                 active_osds.insert(osd);
+                 affected_pgs += q->second.acting + q->second.up;
+                 continue;
+               }
+             }
+             if (num_active_clean < pg_map.num_pg) {
+               // all pgs aren't active+clean; we need to be careful.
+               auto p = pg_map.osd_stat.find(osd);
+               if (p == pg_map.osd_stat.end()) {
+                 missing_stats.insert(osd);
+               }
+               if (p->second.num_pgs > 0) {
+                 stored_pgs.insert(osd);
+               }
+             }
+           }
+         });
+      });
+    if (!r && !active_osds.empty()) {
+      ss << "OSD(s) " << active_osds << " have " << affected_pgs
+        << " pgs currently mapped to them";
+      r = -EBUSY;
+    } else if (!missing_stats.empty()) {
+      ss << "OSD(s) " << missing_stats << " have no reported stats, and not all"
+        << " PGs are active+clean; we cannot draw any conclusions";
+      r = -EAGAIN;
+    } else if (!stored_pgs.empty()) {
+      ss << "OSD(s) " << stored_pgs << " last reported they still store some PG"
+        << " data, and not all PGs are active+clean; we cannot be sure they"
+        << " aren't still needed.";
+      r = -EBUSY;
+    }
+    if (r) {
+      cmdctx->reply(r, ss);
+      return true;
+    }
+    ss << "OSD(s) " << osds << " are safe to destroy without reducing data"
+       << " durability.";
+    cmdctx->reply(0, ss);
+    return true;
+  } else if (prefix == "osd ok-to-stop") {
+    vector<string> ids;
+    cmd_getval(g_ceph_context, cmdctx->cmdmap, "ids", ids);
+    set<int> osds;
+    int r;
+    cluster_state.with_osdmap([&](const OSDMap& osdmap) {
+       r = osdmap.parse_osd_id_list(ids, &osds, &ss);
+      });
+    if (!r && osds.empty()) {
+      ss << "must specify one or more OSDs";
+      r = -EINVAL;
+    }
+    if (r < 0) {
+      cmdctx->reply(r, ss);
+      return true;
+    }
+    map<pg_t,int> pg_delta;  // pgid -> net acting set size change
+    int dangerous_pgs = 0;
+    cluster_state.with_pgmap([&](const PGMap& pg_map) {
+       return cluster_state.with_osdmap([&](const OSDMap& osdmap) {
+           if (pg_map.num_pg_unknown > 0) {
+             ss << pg_map.num_pg_unknown << " pgs have unknown state; "
+                << "cannot draw any conclusions";
+             r = -EAGAIN;
+             return;
+           }
+           for (auto osd : osds) {
+             auto p = pg_map.pg_by_osd.find(osd);
+             if (p != pg_map.pg_by_osd.end()) {
+               for (auto& pgid : p->second) {
+                 --pg_delta[pgid];
+               }
+             }
+           }
+           for (auto& p : pg_delta) {
+             auto q = pg_map.pg_stat.find(p.first);
+             if (q == pg_map.pg_stat.end()) {
+               ss << "missing information about " << p.first << "; cannot draw"
+                  << " any conclusions";
+               r = -EAGAIN;
+               return;
+             }
+             if (!(q->second.state & PG_STATE_ACTIVE) ||
+                 (q->second.state & PG_STATE_DEGRADED)) {
+               // we don't currently have a good way to tell *how* degraded
+               // a degraded PG is, so we have to assume we cannot remove
+               // any more replicas/shards.
+               ++dangerous_pgs;
+               continue;
+             }
+             const pg_pool_t *pi = osdmap.get_pg_pool(p.first.pool());
+             if (!pi) {
+               ++dangerous_pgs; // pool is creating or deleting
+             } else {
+               if (q->second.acting.size() + p.second < pi->min_size) {
+                 ++dangerous_pgs;
+               }
+             }
+           }
+         });
+      });
+    if (r) {
+      cmdctx->reply(r, ss);
+      return true;
+    }
+    if (dangerous_pgs) {
+      ss << dangerous_pgs << " PGs are already degraded or might become "
+        << "unavailable";
+      cmdctx->reply(-EBUSY, ss);
+      return true;
+    }
+    ss << "OSD(s) " << osds << " are ok to stop without reducing"
+       << " availability, provided there are no other concurrent failures"
+       << " or interventions. " << pg_delta.size() << " PGs are likely to be"
+       << " degraded (but remain available) as a result.";
+    cmdctx->reply(0, ss);
+    return true;
    } else if (prefix == "pg force-recovery" ||
                prefix == "pg force-backfill" ||
                prefix == "pg cancel-force-recovery" ||
diff --git a/src/mgr/MgrCommands.h b/src/mgr/MgrCommands.h

index e63bfc48c79bc3c93f3836a30cf1a9e24439887a..1818454e1fcc47df9a5ded7323c5a9c59d5cd55d 100644 (file)
--- a/src/mgr/MgrCommands.h
+++ b/src/mgr/MgrCommands.h
@@ -107,6 +107,13 @@ COMMAND("osd test-reweight-by-pg " \
         "dry run of reweight OSDs by PG distribution [overload-percentage-for-consideration, default 120]", \
         "osd", "r", "cli,rest")
  
+COMMAND("osd safe-to-destroy name=ids,type=CephString,n=N", \
+       "check whether osd(s) can be safely destroyed without reducing data durability", \
+       "osd", "r", "cli,rest")
+COMMAND("osd ok-to-stop name=ids,type=CephString,n=N", \
+       "check whether osd(s) can be safely stopped without reducing immediate" \
+       " data availability", "osd", "r", "cli,rest")
+
  COMMAND("osd scrub " \
         "name=who,type=CephString", \
         "initiate scrub on osd <who>, or use <all|any|*> to scrub all", \
author	Sage Weil <sage@redhat.com>
	Thu, 10 Aug 2017 18:06:02 +0000 (14:06 -0400)
committer	Sage Weil <sage@redhat.com>
	Sun, 13 Aug 2017 19:03:41 +0000 (15:03 -0400)
doc/man/8/ceph.rst		patch \| blob \| history
doc/release-notes.rst		patch \| blob \| history
src/mgr/DaemonServer.cc		patch \| blob \| history
src/mgr/MgrCommands.h		patch \| blob \| history