From: Cory Snyder
Date: Fri, 19 Apr 2024 15:42:00 +0000 (+0000)
Subject: mgr/Mgr.cc: clear daemon health metrics instead of removing down/out osd from daemon state
X-Git-Tag: v17.2.8~65^2
X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=932d7edaeed1a280a91212cec89229ff2eda7b3f;p=ceph.git

mgr/Mgr.cc: clear daemon health metrics instead of removing down/out osd from daemon state

Reverts the change from https://github.com/ceph/ceph/pull/53993 and directly
clears daemon health metrics for down and out OSDs. The former approach of
removing down/out OSDs from the daemon state has undesirable consequences for
stat output, including the prometheus exporter.

Fixes: https://tracker.ceph.com/issues/66168
Signed-off-by: Cory Snyder
(cherry picked from commit 282558cf40274366360bb3b1ec0fa102fbb592a6)
---

diff --git a/src/mgr/Mgr.cc b/src/mgr/Mgr.cc
index b3d710f31abf..91557d1a2206 100644
--- a/src/mgr/Mgr.cc
+++ b/src/mgr/Mgr.cc
@@ -525,7 +525,7 @@ void Mgr::handle_osd_map()
   cluster_state.with_osdmap_and_pgmap([this, &names_exist](const OSDMap &osd_map,
                                                            const PGMap &pg_map) {
     for (int osd_id = 0; osd_id < osd_map.get_max_osd(); ++osd_id) {
-      if (!osd_map.exists(osd_id) || (osd_map.is_out(osd_id) && osd_map.is_down(osd_id))) {
+      if (!osd_map.exists(osd_id)) {
         continue;
       }
 
@@ -537,9 +537,16 @@ void Mgr::handle_osd_map()
       if (daemon_state.is_updating(k)) {
         continue;
       }
+
+      DaemonStatePtr daemon = daemon_state.get(k);
+
+      if (daemon && osd_map.is_out(osd_id) && osd_map.is_down(osd_id)) {
+        std::lock_guard l(daemon->lock);
+        daemon->daemon_health_metrics.clear();
+      }
 
       bool update_meta = false;
-      if (daemon_state.exists(k)) {
+      if (daemon) {
         if (osd_map.get_up_from(osd_id) == osd_map.get_epoch()) {
           dout(4) << "Mgr::handle_osd_map: osd." << osd_id
                   << " joined cluster at " << "e" << osd_map.get_epoch()