From 554be8c47888c6f344472be90b2d0520df16113c Mon Sep 17 00:00:00 2001 From: Sam Goyal Date: Tue, 8 Jul 2025 17:45:14 -0700 Subject: [PATCH] mgr/cephadm: updating maintenance health status in the serve loop The HOST_IN_MAINTENANCE health warning would disappear when the ceph manager restarted or failed over, even though hosts remained in maintenance mode. This occurred because set_maintenance_healthcheck() was only called during host state transitions, not consistently during the serve loop. This commit changes the function name from set_maintenance_healthcheck() to update_maintenance_healthcheck() and then adds a call to update_maintenance_healthcheck() in the serve loop to ensure maintenance health warnings are restored and persist across manager restarts, similar to how the global pause health warning is handled. Fixes: https://tracker.ceph.com/issues/72034 Signed-off-by: Sam Goyal fixed test (cherry picked from commit de4d7a954a57420d02a320b8b2144edc7be81057) --- src/pybind/mgr/cephadm/module.py | 10 +++++----- src/pybind/mgr/cephadm/serve.py | 1 + src/pybind/mgr/cephadm/tests/test_cephadm.py | 2 +- 3 files changed, 7 insertions(+), 6 deletions(-) diff --git a/src/pybind/mgr/cephadm/module.py b/src/pybind/mgr/cephadm/module.py index 1d3f7e96a69..2b501c42a1c 100644 --- a/src/pybind/mgr/cephadm/module.py +++ b/src/pybind/mgr/cephadm/module.py @@ -1841,7 +1841,7 @@ Then run the following: self.inventory.add_host(spec) self.offline_hosts_remove(spec.hostname) if spec.status == 'maintenance': - self.set_maintenance_healthcheck() + self.update_maintenance_healthcheck() self.event.set() # refresh stray health check self.log.info('Added host %s' % spec.hostname) return "Added host '{}' with addr '{}'".format(spec.hostname, spec.addr) @@ -2012,7 +2012,7 @@ Then run the following: self.ssh.reset_con(host) # if host was in offline host list, we should remove it now. self.offline_hosts_remove(host) - self.set_maintenance_healthcheck() + self.update_maintenance_healthcheck() self.event.set() # refresh stray health check self.log.info('Removed host %s' % host) return "Removed {} host '{}'".format('offline' if offline else '', host) @@ -2127,7 +2127,7 @@ Then run the following: self.log.info(msg) return msg - def set_maintenance_healthcheck(self) -> None: + def update_maintenance_healthcheck(self) -> None: """Raise/update or clear the maintenance health check as needed""" in_maintenance = self.inventory.get_host_with_state("maintenance") @@ -2211,7 +2211,7 @@ Then run the following: self.inventory._inventory[hostname] = tgt_host self.inventory.save() - self.set_maintenance_healthcheck() + self.update_maintenance_healthcheck() return f'Daemons for Ceph cluster {self._cluster_fsid} stopped on host {hostname}. Host {hostname} moved to maintenance mode' @handle_orch_error @@ -2305,7 +2305,7 @@ Then run the following: # could have theoretically made a lot of changes to the host. self._invalidate_all_host_metadata_and_kick_serve(hostname) - self.set_maintenance_healthcheck() + self.update_maintenance_healthcheck() return f"Ceph cluster {self._cluster_fsid} on {hostname} has exited maintenance mode" diff --git a/src/pybind/mgr/cephadm/serve.py b/src/pybind/mgr/cephadm/serve.py index fa455c3cb68..b52748b655a 100644 --- a/src/pybind/mgr/cephadm/serve.py +++ b/src/pybind/mgr/cephadm/serve.py @@ -89,6 +89,7 @@ class CephadmServe: self._check_for_strays() self._update_paused_health() + self.mgr.update_maintenance_healthcheck() if self.mgr.need_connect_dashboard_rgw and self.mgr.config_dashboard: self.mgr.need_connect_dashboard_rgw = False diff --git a/src/pybind/mgr/cephadm/tests/test_cephadm.py b/src/pybind/mgr/cephadm/tests/test_cephadm.py index bcfd1e4b65f..47ca24e03b8 100644 --- a/src/pybind/mgr/cephadm/tests/test_cephadm.py +++ b/src/pybind/mgr/cephadm/tests/test_cephadm.py @@ -2526,7 +2526,7 @@ Traceback (most recent call last): cephadm_module.cache.facts = facts assert cephadm_module._validate_tunedprofile_settings(spec) == expected_value - @mock.patch("cephadm.CephadmOrchestrator.set_maintenance_healthcheck", lambda _: None) + @mock.patch("cephadm.CephadmOrchestrator.update_maintenance_healthcheck", lambda _: None) @mock.patch("cephadm.serve.CephadmServe._run_cephadm", _run_cephadm('[]')) def test_tuned_profiles_validation(self, cephadm_module): with with_host(cephadm_module, 'test'): -- 2.39.5