From 9750061d5d4236aaba156d60790e0b8bcd7cfb64 Mon Sep 17 00:00:00 2001 From: David Zafman Date: Sat, 13 Apr 2019 11:32:19 -0700 Subject: [PATCH] mgr: Improve ok-to-stop by using the avail_no_missing for recovery Signed-off-by: David Zafman --- src/mgr/DaemonServer.cc | 56 +++++++++++++++++++++++------------------ 1 file changed, 31 insertions(+), 25 deletions(-) diff --git a/src/mgr/DaemonServer.cc b/src/mgr/DaemonServer.cc index 0c47cfcd2ee..141d9b97761 100644 --- a/src/mgr/DaemonServer.cc +++ b/src/mgr/DaemonServer.cc @@ -1505,7 +1505,7 @@ bool DaemonServer::_handle_command( cmdctx->reply(r, ss); return true; } - map pg_delta; // pgid -> net acting set size change + int touched_pgs = 0; int dangerous_pgs = 0; cluster_state.with_osdmap_and_pgmap([&](const OSDMap& osdmap, const PGMap& pg_map) { if (pg_map.num_pg_unknown > 0) { @@ -1514,35 +1514,40 @@ bool DaemonServer::_handle_command( r = -EAGAIN; return; } - for (auto osd : osds) { - auto p = pg_map.pg_by_osd.find(osd); - if (p != pg_map.pg_by_osd.end()) { - for (auto& pgid : p->second) { - --pg_delta[pgid]; + for (const auto& q : pg_map.pg_stat) { + set pg_acting; // net acting sets (with no missing if degraded) + bool found = false; + if (q.second.state & PG_STATE_DEGRADED) { + for (auto& anm : q.second.avail_no_missing) { + if (osds.count(anm.osd)) { + found = true; + continue; + } + pg_acting.insert(anm.osd); + } + } else { + for (auto& a : q.second.acting) { + if (osds.count(a)) { + found = true; + continue; + } + pg_acting.insert(a); } } - } - for (auto& p : pg_delta) { - auto q = pg_map.pg_stat.find(p.first); - if (q == pg_map.pg_stat.end()) { - ss << "missing information about " << p.first << "; cannot draw" - << " any conclusions"; - r = -EAGAIN; - return; + if (!found) { + continue; } - if (!(q->second.state & PG_STATE_ACTIVE) || - (q->second.state & PG_STATE_DEGRADED)) { - // we don't currently have a good way to tell *how* degraded - // a degraded PG is, so we have to assume we cannot remove - // any more replicas/shards. + touched_pgs++; + if (!(q.second.state & PG_STATE_ACTIVE) || + (q.second.state & PG_STATE_DEGRADED)) { ++dangerous_pgs; continue; } - const pg_pool_t *pi = osdmap.get_pg_pool(p.first.pool()); + const pg_pool_t *pi = osdmap.get_pg_pool(q.first.pool()); if (!pi) { ++dangerous_pgs; // pool is creating or deleting } else { - if (q->second.acting.size() + p.second < pi->min_size) { + if (pg_acting.size() < pi->min_size) { ++dangerous_pgs; } } @@ -1553,14 +1558,15 @@ bool DaemonServer::_handle_command( return true; } if (dangerous_pgs) { - ss << dangerous_pgs << " PGs are already degraded or might become " - << "unavailable"; + ss << dangerous_pgs << " PGs are already too degraded, would become" + << " too degraded or might become unavailable"; cmdctx->reply(-EBUSY, ss); return true; } ss << "OSD(s) " << osds << " are ok to stop without reducing" - << " availability, provided there are no other concurrent failures" - << " or interventions. " << pg_delta.size() << " PGs are likely to be" + << " availability or risking data, provided there are no other concurrent failures" + << " or interventions." << std::endl; + ss << touched_pgs << " PGs are likely to be" << " degraded (but remain available) as a result."; cmdctx->reply(0, ss); return true; -- 2.39.5