From c9d8de36b6251da6386e854561016d3a7bc8549b Mon Sep 17 00:00:00 2001 From: Adam King Date: Wed, 1 Dec 2021 04:01:28 -0500 Subject: [PATCH] mgr/cephadm: don't include agents in CEPHADM_FAILED_DAEMON They already have their own, stricter health warning. There are very few cases where they would show up in the failed daemon health check but not the agent down health check, and even if they did it would be temporary. Also, before this change, agents marked as down were automatically marked as failed even if they didn't meet the typical criteria for failed (systemd status is in error). Fixes: https://tracker.ceph.com/issues/53448 Signed-off-by: Adam King --- src/pybind/mgr/cephadm/module.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/src/pybind/mgr/cephadm/module.py b/src/pybind/mgr/cephadm/module.py index b8cd31931ab..348824f89c2 100644 --- a/src/pybind/mgr/cephadm/module.py +++ b/src/pybind/mgr/cephadm/module.py @@ -754,9 +754,10 @@ class CephadmOrchestrator(orchestrator.Orchestrator, MgrModule, def update_failed_daemon_health_check(self) -> None: failed_daemons = [] for dd in self.cache.get_error_daemons(): - failed_daemons.append('daemon %s on %s is in %s state' % ( - dd.name(), dd.hostname, dd.status_desc - )) + if dd.daemon_type != 'agent': # agents tracked by CEPHADM_AGENT_DOWN + failed_daemons.append('daemon %s on %s is in %s state' % ( + dd.name(), dd.hostname, dd.status_desc + )) self.remove_health_warning('CEPHADM_FAILED_DAEMON') if failed_daemons: self.set_health_warning('CEPHADM_FAILED_DAEMON', f'{len(failed_daemons)} failed cephadm daemon(s)', len( -- 2.47.3