From 01f60cf4e0a751c314120c02956d4ff941eb71b4 Mon Sep 17 00:00:00 2001
From: Sage Weil
Date: Tue, 19 Jan 2021 16:49:08 -0600
Subject: [PATCH] mgr/cephadm: raise HEALTH_WARN when cephadm daemon in
 'error' state

If cephadm daemons are not happy we should raise a warning. Aside from
being an important part of the user experience, this will also help us
catch teuthology test errors.

Fixes: https://tracker.ceph.com/issues/45628
Signed-off-by: Sage Weil
---
 src/pybind/mgr/cephadm/serve.py | 26 +++++++++++++++++++++-----
 1 file changed, 21 insertions(+), 5 deletions(-)

diff --git a/src/pybind/mgr/cephadm/serve.py b/src/pybind/mgr/cephadm/serve.py
index 20a9f94ef4c2e..8c4cdc8d8cddc 100644
--- a/src/pybind/mgr/cephadm/serve.py
+++ b/src/pybind/mgr/cephadm/serve.py
@@ -171,9 +171,14 @@ class CephadmServe:
         refresh(self.mgr.cache.get_hosts())
 
         health_changed = False
-        if 'CEPHADM_HOST_CHECK_FAILED' in self.mgr.health_checks:
-            del self.mgr.health_checks['CEPHADM_HOST_CHECK_FAILED']
-            health_changed = True
+        for k in [
+                'CEPHADM_HOST_CHECK_FAILED',
+                'CEPHADM_FAILED_DAEMON',
+                'CEPHADM_REFRESH_FAILED',
+        ]:
+            if k in self.mgr.health_checks:
+                del self.mgr.health_checks[k]
+                health_changed = True
         if bad_hosts:
             self.mgr.health_checks['CEPHADM_HOST_CHECK_FAILED'] = {
                 'severity': 'warning',
@@ -190,8 +195,19 @@ class CephadmServe:
                 'detail': failures,
             }
             health_changed = True
-        elif 'CEPHADM_REFRESH_FAILED' in self.mgr.health_checks:
-            del self.mgr.health_checks['CEPHADM_REFRESH_FAILED']
+        failed_daemons = []
+        for dd in self.mgr.cache.get_daemons():
+            if dd.status < 0:
+                failed_daemons.append('daemon %s on %s is in %s state' % (
+                    dd.name(), dd.hostname, dd.status_desc
+                ))
+        if failed_daemons:
+            self.mgr.health_checks['CEPHADM_FAILED_DAEMON'] = {
+                'severity': 'warning',
+                'summary': '%d failed cephadm daemon(s)' % len(failed_daemons),
+                'count': len(failed_daemons),
+                'detail': failed_daemons,
+            }
             health_changed = True
         if health_changed:
             self.mgr.set_health_checks(self.mgr.health_checks)
-- 
2.39.5
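
Note on the new check: CEPHADM_FAILED_DAEMON only fires for daemons whose
cached status is negative (the 'error' state), and it summarizes all such
daemons in a single warning. Below is a minimal, self-contained sketch of
that aggregation logic for illustration; FakeDaemon and
build_failed_daemon_check are hypothetical stand-ins, not part of the real
cephadm module or its DaemonDescription/cache types.

# Illustrative sketch only: FakeDaemon and build_failed_daemon_check are
# hypothetical stand-ins for cephadm's daemon cache and health-check plumbing.
from typing import Dict, List, Optional


class FakeDaemon:
    def __init__(self, name: str, hostname: str, status: int, status_desc: str):
        self._name = name
        self.hostname = hostname
        self.status = status          # < 0 models a daemon in an error state
        self.status_desc = status_desc

    def name(self) -> str:
        return self._name


def build_failed_daemon_check(daemons: List[FakeDaemon]) -> Optional[Dict]:
    # Mirrors the patch: collect every daemon whose status is negative and
    # roll them up into one HEALTH_WARN-style check dict.
    failed = [
        'daemon %s on %s is in %s state' % (d.name(), d.hostname, d.status_desc)
        for d in daemons if d.status < 0
    ]
    if not failed:
        return None
    return {
        'severity': 'warning',
        'summary': '%d failed cephadm daemon(s)' % len(failed),
        'count': len(failed),
        'detail': failed,
    }


if __name__ == '__main__':
    daemons = [
        FakeDaemon('osd.3', 'host1', -1, 'error'),
        FakeDaemon('mon.a', 'host2', 1, 'running'),
    ]
    # Prints a single warning describing osd.3; returns None if nothing failed.
    print(build_failed_daemon_check(daemons))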