git.apps.os.sepia.ceph.com Git - ceph.git/commitdiff
mgr/cephadm: raise HEALTH_WARN when cephadm daemon in 'error' state 38978/head
author: Sage Weil <sage@newdream.net>
Tue, 19 Jan 2021 22:49:08 +0000 (16:49 -0600)
committer: Sage Weil <sage@newdream.net>
Tue, 19 Jan 2021 22:49:08 +0000 (16:49 -0600)
If cephadm daemons are not happy we should raise a warning.  Aside from
being an important part of the user experience, this will also help us
catch teuthology test errors.

Fixes: https://tracker.ceph.com/issues/45628
Signed-off-by: Sage Weil <sage@newdream.net>
src/pybind/mgr/cephadm/serve.py

index 20a9f94ef4c2e507b0b0f5a01e521a8b602833fc..8c4cdc8d8cddc40195fd4a9055e65cb054ae488f 100644 (file)
@@ -171,9 +171,14 @@ class CephadmServe:
         refresh(self.mgr.cache.get_hosts())
 
         health_changed = False
-        if 'CEPHADM_HOST_CHECK_FAILED' in self.mgr.health_checks:
-            del self.mgr.health_checks['CEPHADM_HOST_CHECK_FAILED']
-            health_changed = True
+        for k in [  # drop previous results before re-evaluating them
+                'CEPHADM_HOST_CHECK_FAILED',
+                'CEPHADM_FAILED_DAEMON',
+                'CEPHADM_REFRESH_FAILED',
+        ]:
+            if k in self.mgr.health_checks:
+                del self.mgr.health_checks[k]
+                health_changed = True
         if bad_hosts:
             self.mgr.health_checks['CEPHADM_HOST_CHECK_FAILED'] = {
                 'severity': 'warning',
@@ -190,8 +195,19 @@ class CephadmServe:
                 'detail': failures,
             }
             health_changed = True
-        elif 'CEPHADM_REFRESH_FAILED' in self.mgr.health_checks:
-            del self.mgr.health_checks['CEPHADM_REFRESH_FAILED']
+        failed_daemons = []  # one human-readable line per failed daemon
+        for dd in self.mgr.cache.get_daemons():
+            if dd.status < 0:  # a negative status is reported as a failure
+                failed_daemons.append('daemon %s on %s is in %s state' % (
+                    dd.name(), dd.hostname, dd.status_desc
+                ))
+        if failed_daemons:
+            self.mgr.health_checks['CEPHADM_FAILED_DAEMON'] = {
+                'severity': 'warning',
+                'summary': '%d failed cephadm daemon(s)' % len(failed_daemons),
+                'count': len(failed_daemons),
+                'detail': failed_daemons,
+            }
             health_changed = True
         if health_changed:
             self.mgr.set_health_checks(self.mgr.health_checks)