From: Sam Goyal Date: Wed, 9 Jul 2025 00:45:14 +0000 (-0700) Subject: mgr/cephadm: updating maintenance health status in the serve loop X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=de4d7a954a57420d02a320b8b2144edc7be81057;p=ceph.git mgr/cephadm: updating maintenance health status in the serve loop The HOST_IN_MAINTENANCE health warning would disappear when the ceph manager restarted or failed over, even though hosts remained in maintenance mode. This occurred because set_maintenance_healthcheck() was only called during host state transitions, not consistently during the serve loop. This commit changes the function name from set_maintenance_healthcheck() to update_maintenance_healthcheck() and then adds a call to update_maintenance_healthcheck() in the serve loop to ensure maintenance health warnings are restored and persist across manager restarts, similar to how the global pause health warning is handled. Fixes: https://tracker.ceph.com/issues/72034 Signed-off-by: Sam Goyal --- diff --git a/src/pybind/mgr/cephadm/module.py b/src/pybind/mgr/cephadm/module.py index 1d3f7e96a694..2b501c42a1c8 100644 --- a/src/pybind/mgr/cephadm/module.py +++ b/src/pybind/mgr/cephadm/module.py @@ -1841,7 +1841,7 @@ Then run the following: self.inventory.add_host(spec) self.offline_hosts_remove(spec.hostname) if spec.status == 'maintenance': - self.set_maintenance_healthcheck() + self.update_maintenance_healthcheck() self.event.set() # refresh stray health check self.log.info('Added host %s' % spec.hostname) return "Added host '{}' with addr '{}'".format(spec.hostname, spec.addr) @@ -2012,7 +2012,7 @@ Then run the following: self.ssh.reset_con(host) # if host was in offline host list, we should remove it now. 
self.offline_hosts_remove(host) - self.set_maintenance_healthcheck() + self.update_maintenance_healthcheck() self.event.set() # refresh stray health check self.log.info('Removed host %s' % host) return "Removed {} host '{}'".format('offline' if offline else '', host) @@ -2127,7 +2127,7 @@ Then run the following: self.log.info(msg) return msg - def set_maintenance_healthcheck(self) -> None: + def update_maintenance_healthcheck(self) -> None: """Raise/update or clear the maintenance health check as needed""" in_maintenance = self.inventory.get_host_with_state("maintenance") @@ -2211,7 +2211,7 @@ Then run the following: self.inventory._inventory[hostname] = tgt_host self.inventory.save() - self.set_maintenance_healthcheck() + self.update_maintenance_healthcheck() return f'Daemons for Ceph cluster {self._cluster_fsid} stopped on host {hostname}. Host {hostname} moved to maintenance mode' @handle_orch_error @@ -2305,7 +2305,7 @@ Then run the following: # could have theoretically made a lot of changes to the host. 
self._invalidate_all_host_metadata_and_kick_serve(hostname) - self.set_maintenance_healthcheck() + self.update_maintenance_healthcheck() return f"Ceph cluster {self._cluster_fsid} on {hostname} has exited maintenance mode" diff --git a/src/pybind/mgr/cephadm/serve.py b/src/pybind/mgr/cephadm/serve.py index 68dc5e38b34d..3cb818073ce2 100644 --- a/src/pybind/mgr/cephadm/serve.py +++ b/src/pybind/mgr/cephadm/serve.py @@ -89,6 +89,7 @@ class CephadmServe: self._check_for_strays() self._update_paused_health() + self.mgr.update_maintenance_healthcheck() if self.mgr.need_connect_dashboard_rgw and self.mgr.config_dashboard: self.mgr.need_connect_dashboard_rgw = False diff --git a/src/pybind/mgr/cephadm/tests/test_cephadm.py b/src/pybind/mgr/cephadm/tests/test_cephadm.py index bcfd1e4b65fc..47ca24e03b87 100644 --- a/src/pybind/mgr/cephadm/tests/test_cephadm.py +++ b/src/pybind/mgr/cephadm/tests/test_cephadm.py @@ -2526,7 +2526,7 @@ Traceback (most recent call last): cephadm_module.cache.facts = facts assert cephadm_module._validate_tunedprofile_settings(spec) == expected_value - @mock.patch("cephadm.CephadmOrchestrator.set_maintenance_healthcheck", lambda _: None) + @mock.patch("cephadm.CephadmOrchestrator.update_maintenance_healthcheck", lambda _: None) @mock.patch("cephadm.serve.CephadmServe._run_cephadm", _run_cephadm('[]')) def test_tuned_profiles_validation(self, cephadm_module): with with_host(cephadm_module, 'test'):