mgr/cephadm: update maintenance health status in the serve loop
author    Sam Goyal <sam.goyal@clyso.com>
          Wed, 9 Jul 2025 00:45:14 +0000 (17:45 -0700)
committer Sam Goyal <sam.goyal@clyso.com>
          Wed, 9 Jul 2025 17:24:46 +0000 (10:24 -0700)
The HOST_IN_MAINTENANCE health warning would disappear when the ceph manager
restarted or failed over, even though hosts remained in maintenance mode.
This occurred because set_maintenance_healthcheck() was only called during
host state transitions, not consistently during the serve loop.

This commit renames set_maintenance_healthcheck() to
update_maintenance_healthcheck() and adds a call to it in the serve loop,
ensuring maintenance health warnings are restored and persist across
manager restarts, similar to how the global pause health warning is handled.

Fixes: https://tracker.ceph.com/issues/72034
Signed-off-by: Sam Goyal <sam.goyal@clyso.com>
Also updates the corresponding mock in test_cephadm.py for the renamed function.

src/pybind/mgr/cephadm/module.py
src/pybind/mgr/cephadm/serve.py
src/pybind/mgr/cephadm/tests/test_cephadm.py
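
The fix follows the same pattern already used for the global pause warning:
rather than raising the health check only on host state transitions, the
serve loop re-asserts it on every cycle, so a newly active mgr rebuilds the
warning from the persisted inventory. A condensed sketch of that loop shape
(the calls are abridged from the serve.py hunk below; the loop condition is
an assumption, not part of this diff):

    # inside CephadmServe.serve(), executed once per scheduling cycle
    while self.mgr.run:                      # loop condition assumed
        self._check_for_strays()
        self._update_paused_health()         # re-asserts the pause warning
        # Unconditionally raise/clear HOST_IN_MAINTENANCE each cycle; this
        # is what restores the warning after a mgr restart or failover.
        self.mgr.update_maintenance_healthcheck()
        ...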

diff --git a/src/pybind/mgr/cephadm/module.py b/src/pybind/mgr/cephadm/module.py
index 1d3f7e96a694af586f7db9801b0a21dcaff7253f..2b501c42a1c8e8cceb128de6763fe2aa0ac51302 100644
@@ -1841,7 +1841,7 @@ Then run the following:
         self.inventory.add_host(spec)
         self.offline_hosts_remove(spec.hostname)
         if spec.status == 'maintenance':
-            self.set_maintenance_healthcheck()
+            self.update_maintenance_healthcheck()
         self.event.set()  # refresh stray health check
         self.log.info('Added host %s' % spec.hostname)
         return "Added host '{}' with addr '{}'".format(spec.hostname, spec.addr)
@@ -2012,7 +2012,7 @@ Then run the following:
         self.ssh.reset_con(host)
         # if host was in offline host list, we should remove it now.
         self.offline_hosts_remove(host)
-        self.set_maintenance_healthcheck()
+        self.update_maintenance_healthcheck()
         self.event.set()  # refresh stray health check
         self.log.info('Removed host %s' % host)
         return "Removed {} host '{}'".format('offline' if offline else '', host)
@@ -2127,7 +2127,7 @@ Then run the following:
         self.log.info(msg)
         return msg
 
-    def set_maintenance_healthcheck(self) -> None:
+    def update_maintenance_healthcheck(self) -> None:
         """Raise/update or clear the maintenance health check as needed"""
 
         in_maintenance = self.inventory.get_host_with_state("maintenance")
@@ -2211,7 +2211,7 @@ Then run the following:
         self.inventory._inventory[hostname] = tgt_host
         self.inventory.save()
 
-        self.set_maintenance_healthcheck()
+        self.update_maintenance_healthcheck()
         return f'Daemons for Ceph cluster {self._cluster_fsid} stopped on host {hostname}. Host {hostname} moved to maintenance mode'
 
     @handle_orch_error
@@ -2305,7 +2305,7 @@ Then run the following:
         # could have theoretically made a lot of changes to the host.
         self._invalidate_all_host_metadata_and_kick_serve(hostname)
 
-        self.set_maintenance_healthcheck()
+        self.update_maintenance_healthcheck()
 
         return f"Ceph cluster {self._cluster_fsid} on {hostname} has exited maintenance mode"
 
diff --git a/src/pybind/mgr/cephadm/serve.py b/src/pybind/mgr/cephadm/serve.py
index 68dc5e38b34dcecc4eb927f70d565475413f3bf9..3cb818073ce26646f2755e5ad94160441e4297f9 100644
@@ -89,6 +89,7 @@ class CephadmServe:
                 self._check_for_strays()
 
                 self._update_paused_health()
+                self.mgr.update_maintenance_healthcheck()
 
                 if self.mgr.need_connect_dashboard_rgw and self.mgr.config_dashboard:
                     self.mgr.need_connect_dashboard_rgw = False
diff --git a/src/pybind/mgr/cephadm/tests/test_cephadm.py b/src/pybind/mgr/cephadm/tests/test_cephadm.py
index bcfd1e4b65fcc663dfaa4e41b723c4068ccadbf4..47ca24e03b87601ca7ec26e281ea23573fab9cb6 100644
@@ -2526,7 +2526,7 @@ Traceback (most recent call last):
             cephadm_module.cache.facts = facts
             assert cephadm_module._validate_tunedprofile_settings(spec) == expected_value
 
-    @mock.patch("cephadm.CephadmOrchestrator.set_maintenance_healthcheck", lambda _: None)
+    @mock.patch("cephadm.CephadmOrchestrator.update_maintenance_healthcheck", lambda _: None)
     @mock.patch("cephadm.serve.CephadmServe._run_cephadm", _run_cephadm('[]'))
     def test_tuned_profiles_validation(self, cephadm_module):
         with with_host(cephadm_module, 'test'):
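
A possible regression test for the restored behaviour, sketched against the
harness already used in this file (with_host, cephadm_module, mock, and
_run_cephadm are the existing fixtures and imports visible in the hunk above;
the direct _inventory status mutation and the set_health_warning spy are
assumptions about cephadm internals, not part of this commit):

    @mock.patch("cephadm.serve.CephadmServe._run_cephadm", _run_cephadm('[]'))
    def test_maintenance_healthcheck_reasserted(self, cephadm_module):
        with with_host(cephadm_module, 'test'):
            # Mark the host as in maintenance in the persisted inventory.
            cephadm_module.inventory._inventory['test']['status'] = 'maintenance'
            with mock.patch.object(cephadm_module, 'set_health_warning') as warn:
                # A newly active mgr would run this from the serve loop
                # on its first cycle, re-raising the warning.
                cephadm_module.update_maintenance_healthcheck()
                warn.assert_called_once()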