git.apps.os.sepia.ceph.com Git - ceph.git/commitdiff
mgr: Improve ok-to-stop by using the avail_no_missing for recovery
author David Zafman <dzafman@redhat.com>
Sat, 13 Apr 2019 18:32:19 +0000 (11:32 -0700)
committer Smith Farm <smithfarm@vanguard2.suse.cz>
Tue, 30 Apr 2019 15:57:41 +0000 (17:57 +0200)
Signed-off-by: David Zafman <dzafman@redhat.com>
(cherry picked from commit 9750061d5d4236aaba156d60790e0b8bcd7cfb64)

src/mgr/DaemonServer.cc

index 38143ecc227f8c36e1c0f625919539018840eca8..e9a868f0abc4817043465b215162c318f2cb0991 100644 (file)
@@ -1509,7 +1509,7 @@ bool DaemonServer::_handle_command(
       cmdctx->reply(r, ss);
       return true;
     }
-    map<pg_t,int> pg_delta;  // pgid -> net acting set size change
+    int touched_pgs = 0;
     int dangerous_pgs = 0;
     cluster_state.with_osdmap_and_pgmap([&](const OSDMap& osdmap, const PGMap& pg_map) {
        if (pg_map.num_pg_unknown > 0) {
@@ -1518,35 +1518,40 @@ bool DaemonServer::_handle_command(
          r = -EAGAIN;
          return;
        }
-       for (auto osd : osds) {
-         auto p = pg_map.pg_by_osd.find(osd);
-         if (p != pg_map.pg_by_osd.end()) {
-           for (auto& pgid : p->second) {
-             --pg_delta[pgid];
+       for (const auto& q : pg_map.pg_stat) {
+          set<int32_t> pg_acting;  // net acting sets (with no missing if degraded)
+         bool found = false;
+         if (q.second.state & PG_STATE_DEGRADED) {
+           for (auto& anm : q.second.avail_no_missing) {
+             if (osds.count(anm.osd)) {
+               found = true;
+               continue;
+             }
+             pg_acting.insert(anm.osd);
+           }
+         } else {
+           for (auto& a : q.second.acting) {
+             if (osds.count(a)) {
+               found = true;
+               continue;
+             }
+             pg_acting.insert(a);
            }
          }
-       }
-       for (auto& p : pg_delta) {
-         auto q = pg_map.pg_stat.find(p.first);
-         if (q == pg_map.pg_stat.end()) {
-           ss << "missing information about " << p.first << "; cannot draw"
-              << " any conclusions";
-           r = -EAGAIN;
-           return;
+         if (!found) {
+           continue;
          }
-         if (!(q->second.state & PG_STATE_ACTIVE) ||
-             (q->second.state & PG_STATE_DEGRADED)) {
-           // we don't currently have a good way to tell *how* degraded
-           // a degraded PG is, so we have to assume we cannot remove
-           // any more replicas/shards.
+         touched_pgs++;
+         if (!(q.second.state & PG_STATE_ACTIVE) ||
+             (q.second.state & PG_STATE_DEGRADED)) {
            ++dangerous_pgs;
            continue;
          }
-         const pg_pool_t *pi = osdmap.get_pg_pool(p.first.pool());
+         const pg_pool_t *pi = osdmap.get_pg_pool(q.first.pool());
          if (!pi) {
            ++dangerous_pgs; // pool is creating or deleting
          } else {
-           if (q->second.acting.size() + p.second < pi->min_size) {
+           if (pg_acting.size() < pi->min_size) {
              ++dangerous_pgs;
            }
          }
@@ -1557,14 +1562,15 @@ bool DaemonServer::_handle_command(
       return true;
     }
     if (dangerous_pgs) {
-      ss << dangerous_pgs << " PGs are already degraded or might become "
-        << "unavailable";
+      ss << dangerous_pgs << " PGs are already too degraded, would become"
+        << " too degraded or might become unavailable";
       cmdctx->reply(-EBUSY, ss);
       return true;
     }
     ss << "OSD(s) " << osds << " are ok to stop without reducing"
-       << " availability, provided there are no other concurrent failures"
-       << " or interventions. " << pg_delta.size() << " PGs are likely to be"
+       << " availability or risking data, provided there are no other concurrent failures"
+       << " or interventions." << std::endl;
+    ss << touched_pgs << " PGs are likely to be"
        << " degraded (but remain available) as a result.";
     cmdctx->reply(0, ss);
     return true;