git.apps.os.sepia.ceph.com Git - ceph.git/commitdiff
mgr/cephadm: raise HEALTH_WARN when cephadm daemon in 'error' state 39169/head
author Sage Weil <sage@newdream.net>
Tue, 19 Jan 2021 22:49:08 +0000 (16:49 -0600)
committer Sebastian Wagner <sebastian.wagner@suse.com>
Fri, 29 Jan 2021 15:11:13 +0000 (16:11 +0100)
If cephadm daemons are not happy we should raise a warning.  Aside from
being an important part of the user experience, this will also help us
catch teuthology test errors.

Fixes: https://tracker.ceph.com/issues/45628
Signed-off-by: Sage Weil <sage@newdream.net>
(cherry picked from commit 01f60cf4e0a751c314120c02956d4ff941eb71b4)

src/pybind/mgr/cephadm/serve.py

index 7808dc5baded94bac0433aa411476901e7e1d059..cfef14b423afce03ab26532d435750d67a55991f 100644 (file)
@@ -155,9 +155,14 @@ class CephadmServe:
         refresh(self.mgr.cache.get_hosts())
 
         health_changed = False
-        if 'CEPHADM_HOST_CHECK_FAILED' in self.mgr.health_checks:
-            del self.mgr.health_checks['CEPHADM_HOST_CHECK_FAILED']
-            health_changed = True
+        for k in [  # clear old results; checks still failing are set again below
+                'CEPHADM_HOST_CHECK_FAILED',
+                'CEPHADM_FAILED_DAEMON',
+                'CEPHADM_REFRESH_FAILED',
+        ]:
+            if k in self.mgr.health_checks:
+                del self.mgr.health_checks[k]
+                health_changed = True
         if bad_hosts:
             self.mgr.health_checks['CEPHADM_HOST_CHECK_FAILED'] = {
                 'severity': 'warning',
@@ -174,8 +179,19 @@ class CephadmServe:
                 'detail': failures,
             }
             health_changed = True
-        elif 'CEPHADM_REFRESH_FAILED' in self.mgr.health_checks:
-            del self.mgr.health_checks['CEPHADM_REFRESH_FAILED']
+        failed_daemons = []
+        for dd in self.mgr.cache.get_daemons():
+            if dd.status < 0:
+                failed_daemons.append('daemon %s on %s is in %s state' % (
+                    dd.name(), dd.hostname, dd.status_desc
+                ))
+        if failed_daemons:
+            self.mgr.health_checks['CEPHADM_FAILED_DAEMON'] = {
+                'severity': 'warning',
+                'summary': '%d failed cephadm daemon(s)' % len(failed_daemons),
+                'count': len(failed_daemons),
+                'detail': failed_daemons,
+            }
             health_changed = True
         if health_changed:
             self.mgr.set_health_checks(self.mgr.health_checks)