From 554be8c47888c6f344472be90b2d0520df16113c Mon Sep 17 00:00:00 2001
From: Sam Goyal <sam.goyal@clyso.com>
Date: Tue, 8 Jul 2025 17:45:14 -0700
Subject: [PATCH] mgr/cephadm: updating maintenance health status in the serve
 loop

The HOST_IN_MAINTENANCE health warning would disappear when the ceph manager
restarted or failed over, even though hosts remained in maintenance mode.
This occurred because set_maintenance_healthcheck() was only called during
host state transitions, not consistently during the serve loop.

This commit changes the function name from set_maintenance_healthcheck() to update_maintenance_healthcheck()
and then adds a call to update_maintenance_healthcheck() in the serve loop
to ensure maintenance health warnings are restored and persist across
manager restarts, similar to how the global pause health warning is handled.

Fixes: https://tracker.ceph.com/issues/72034
Signed-off-by: Sam Goyal <sam.goyal@clyso.com>

fixed test

(cherry picked from commit de4d7a954a57420d02a320b8b2144edc7be81057)
---
 src/pybind/mgr/cephadm/module.py             | 10 +++++-----
 src/pybind/mgr/cephadm/serve.py              |  1 +
 src/pybind/mgr/cephadm/tests/test_cephadm.py |  2 +-
 3 files changed, 7 insertions(+), 6 deletions(-)

diff --git a/src/pybind/mgr/cephadm/module.py b/src/pybind/mgr/cephadm/module.py
index 1d3f7e96a69..2b501c42a1c 100644
--- a/src/pybind/mgr/cephadm/module.py
+++ b/src/pybind/mgr/cephadm/module.py
@@ -1841,7 +1841,7 @@ Then run the following:
         self.inventory.add_host(spec)
         self.offline_hosts_remove(spec.hostname)
         if spec.status == 'maintenance':
-            self.set_maintenance_healthcheck()
+            self.update_maintenance_healthcheck()
         self.event.set()  # refresh stray health check
         self.log.info('Added host %s' % spec.hostname)
         return "Added host '{}' with addr '{}'".format(spec.hostname, spec.addr)
@@ -2012,7 +2012,7 @@ Then run the following:
         self.ssh.reset_con(host)
         # if host was in offline host list, we should remove it now.
         self.offline_hosts_remove(host)
-        self.set_maintenance_healthcheck()
+        self.update_maintenance_healthcheck()
         self.event.set()  # refresh stray health check
         self.log.info('Removed host %s' % host)
         return "Removed {} host '{}'".format('offline' if offline else '', host)
@@ -2127,7 +2127,7 @@ Then run the following:
         self.log.info(msg)
         return msg
 
-    def set_maintenance_healthcheck(self) -> None:
+    def update_maintenance_healthcheck(self) -> None:
         """Raise/update or clear the maintenance health check as needed"""
 
         in_maintenance = self.inventory.get_host_with_state("maintenance")
@@ -2211,7 +2211,7 @@ Then run the following:
         self.inventory._inventory[hostname] = tgt_host
         self.inventory.save()
 
-        self.set_maintenance_healthcheck()
+        self.update_maintenance_healthcheck()
         return f'Daemons for Ceph cluster {self._cluster_fsid} stopped on host {hostname}. Host {hostname} moved to maintenance mode'
 
     @handle_orch_error
@@ -2305,7 +2305,7 @@ Then run the following:
         # could have theoretically made a lot of changes to the host.
         self._invalidate_all_host_metadata_and_kick_serve(hostname)
 
-        self.set_maintenance_healthcheck()
+        self.update_maintenance_healthcheck()
 
         return f"Ceph cluster {self._cluster_fsid} on {hostname} has exited maintenance mode"
 
diff --git a/src/pybind/mgr/cephadm/serve.py b/src/pybind/mgr/cephadm/serve.py
index fa455c3cb68..b52748b655a 100644
--- a/src/pybind/mgr/cephadm/serve.py
+++ b/src/pybind/mgr/cephadm/serve.py
@@ -89,6 +89,7 @@ class CephadmServe:
                 self._check_for_strays()
 
                 self._update_paused_health()
+                self.mgr.update_maintenance_healthcheck()
 
                 if self.mgr.need_connect_dashboard_rgw and self.mgr.config_dashboard:
                     self.mgr.need_connect_dashboard_rgw = False
diff --git a/src/pybind/mgr/cephadm/tests/test_cephadm.py b/src/pybind/mgr/cephadm/tests/test_cephadm.py
index bcfd1e4b65f..47ca24e03b8 100644
--- a/src/pybind/mgr/cephadm/tests/test_cephadm.py
+++ b/src/pybind/mgr/cephadm/tests/test_cephadm.py
@@ -2526,7 +2526,7 @@ Traceback (most recent call last):
             cephadm_module.cache.facts = facts
             assert cephadm_module._validate_tunedprofile_settings(spec) == expected_value
 
-    @mock.patch("cephadm.CephadmOrchestrator.set_maintenance_healthcheck", lambda _: None)
+    @mock.patch("cephadm.CephadmOrchestrator.update_maintenance_healthcheck", lambda _: None)
     @mock.patch("cephadm.serve.CephadmServe._run_cephadm", _run_cephadm('[]'))
     def test_tuned_profiles_validation(self, cephadm_module):
         with with_host(cephadm_module, 'test'):
-- 
2.39.5