refresh(self.mgr.cache.get_hosts())
health_changed = False
- if 'CEPHADM_HOST_CHECK_FAILED' in self.mgr.health_checks:
- del self.mgr.health_checks['CEPHADM_HOST_CHECK_FAILED']
- health_changed = True
+ for k in [
+ 'CEPHADM_HOST_CHECK_FAILED',
+ 'CEPHADM_FAILED_DAEMON',  # comma required: adjacent string literals silently concatenate
+ 'CEPHADM_REFRESH_FAILED',
+ ]:
+ if k in self.mgr.health_checks:
+ del self.mgr.health_checks[k]
+ health_changed = True
if bad_hosts:
self.mgr.health_checks['CEPHADM_HOST_CHECK_FAILED'] = {
'severity': 'warning',
'detail': failures,
}
health_changed = True
- elif 'CEPHADM_REFRESH_FAILED' in self.mgr.health_checks:
- del self.mgr.health_checks['CEPHADM_REFRESH_FAILED']
+ failed_daemons = []
+ for dd in self.mgr.cache.get_daemons():
+ if dd.status < 0:
+ failed_daemons.append('daemon %s on %s is in %s state' % (
+ dd.name(), dd.hostname, dd.status_desc
+ ))
+ if failed_daemons:
+ self.mgr.health_checks['CEPHADM_FAILED_DAEMON'] = {
+ 'severity': 'warning',
+ 'summary': '%d failed cephadm daemon(s)' % len(failed_daemons),
+ 'count': len(failed_daemons),
+ 'detail': failed_daemons,
+ }
health_changed = True
if health_changed:
self.mgr.set_health_checks(self.mgr.health_checks)