From: Melissa Li
Date: Thu, 5 Aug 2021 18:14:38 +0000 (-0400)
Subject: mgr/cephadm: set health check warning for apply spec failures and daemon place failures in serve
X-Git-Tag: v16.2.7~33^2~20
X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=9bd96ff24d9a9b9ea31d507ea1f7979a22c4c4df;p=ceph.git
mgr/cephadm: set health check warning for apply spec failures and daemon place failures in serve
Fixes: https://tracker.ceph.com/issues/44414
Signed-off-by: Melissa Li
(cherry picked from commit 1ccdd941c938d6fc3cd8996353495b28545427b1)
Conflicts:
src/pybind/mgr/cephadm/module.py
src/pybind/mgr/cephadm/serve.py
---
diff --git a/src/pybind/mgr/cephadm/module.py b/src/pybind/mgr/cephadm/module.py
index 6f44e49502ff..c88a4a6bdbc1 100644
--- a/src/pybind/mgr/cephadm/module.py
+++ b/src/pybind/mgr/cephadm/module.py
@@ -407,7 +407,7 @@ class CephadmOrchestrator(orchestrator.Orchestrator, MgrModule,
self.default_registry = ''
self.autotune_memory_target_ratio = 0.0
self.autotune_interval = 0
-
+ self.apply_spec_fails: List[Tuple[str, str]] = []
self.max_osd_draining_count = 10
self._cons: Dict[str, Tuple[remoto.backends.BaseConnection,
diff --git a/src/pybind/mgr/cephadm/serve.py b/src/pybind/mgr/cephadm/serve.py
index 186bd4dd7a14..acbdd9af53ad 100644
--- a/src/pybind/mgr/cephadm/serve.py
+++ b/src/pybind/mgr/cephadm/serve.py
@@ -548,14 +548,25 @@ class CephadmServe:
specs = [] # type: List[ServiceSpec]
for sn, spec in self.mgr.spec_store.active_specs.items():
specs.append(spec)
+ for name in ['CEPHADM_APPLY_SPEC_FAIL', 'CEPHADM_DAEMON_PLACE_FAIL']:
+ self.mgr.remove_health_warning(name)
+ self.mgr.apply_spec_fails = []
for spec in specs:
try:
if self._apply_service(spec):
r = True
except Exception as e:
- self.log.exception('Failed to apply %s spec %s: %s' % (
- spec.service_name(), spec, e))
+ msg = f'Failed to apply {spec.service_name()} spec {spec}: {str(e)}'
+ self.log.exception(msg)
self.mgr.events.for_service(spec, 'ERROR', 'Failed to apply: ' + str(e))
+ self.mgr.apply_spec_fails.append((spec.service_name(), str(e)))
+ warnings = []
+ for x in self.mgr.apply_spec_fails:
+ warnings.append(f'{x[0]}: {x[1]}')
+ self.mgr.set_health_warning('CEPHADM_APPLY_SPEC_FAIL',
+ f"Failed to apply {len(self.mgr.apply_spec_fails)} service(s): {','.join(x[0] for x in self.mgr.apply_spec_fails)}",
+ len(self.mgr.apply_spec_fails),
+ warnings)
return r
@@ -670,9 +681,17 @@ class CephadmServe:
'status', '').lower() not in ['maintenance', 'offline'] and d.hostname not in self.mgr.offline_hosts)]
self.log.debug('Add %s, remove %s' % (slots_to_add, daemons_to_remove))
except OrchestratorError as e:
- self.log.error('Failed to apply %s spec %s: %s' % (
- spec.service_name(), spec, e))
+ msg = f'Failed to apply {spec.service_name()} spec {spec}: {str(e)}'
+ self.log.error(msg)
self.mgr.events.for_service(spec, 'ERROR', 'Failed to apply: ' + str(e))
+ self.mgr.apply_spec_fails.append((spec.service_name(), str(e)))
+ warnings = []
+ for x in self.mgr.apply_spec_fails:
+ warnings.append(f'{x[0]}: {x[1]}')
+ self.mgr.set_health_warning('CEPHADM_APPLY_SPEC_FAIL',
+ f"Failed to apply {len(self.mgr.apply_spec_fails)} service(s): {','.join(x[0] for x in self.mgr.apply_spec_fails)}",
+ len(self.mgr.apply_spec_fails),
+ warnings)
return False
r = None
@@ -746,6 +765,7 @@ class CephadmServe:
svc.fence_old_ranks(spec, rank_map, len(all_slots))
# create daemons
+ daemon_place_fails = []
for slot in slots_to_add:
# first remove daemon on conflicting port?
if slot.ports:
@@ -794,6 +814,7 @@ class CephadmServe:
f"on {slot.hostname}: {e}")
self.mgr.events.for_service(spec, 'ERROR', msg)
self.mgr.log.error(msg)
+ daemon_place_fails.append(msg)
# only return "no change" if no one else has already succeeded.
# later successes will also change to True
if r is None:
@@ -810,6 +831,9 @@ class CephadmServe:
)
daemons.append(sd)
+ if daemon_place_fails:
+ self.mgr.set_health_warning('CEPHADM_DAEMON_PLACE_FAIL', f'Failed to place {len(daemon_place_fails)} daemon(s)', len(daemon_place_fails), daemon_place_fails)
+
# remove any?
def _ok_to_stop(remove_daemons: List[orchestrator.DaemonDescription]) -> bool:
daemon_ids = [d.daemon_id for d in remove_daemons]
diff --git a/src/pybind/mgr/cephadm/tests/test_cephadm.py b/src/pybind/mgr/cephadm/tests/test_cephadm.py
index a83042cf49f2..85bc87a9dab1 100644
--- a/src/pybind/mgr/cephadm/tests/test_cephadm.py
+++ b/src/pybind/mgr/cephadm/tests/test_cephadm.py
@@ -858,6 +858,32 @@ class TestCephadm(object):
'entity': entity,
})
+ @mock.patch("cephadm.serve.CephadmServe._run_cephadm")
+ def test_daemon_place_fail_health_warning(self, _run_cephadm, cephadm_module):
+ _run_cephadm.return_value = ('{}', '', 0)
+ with with_host(cephadm_module, 'test'):
+ _run_cephadm.side_effect = OrchestratorError('fail')
+ ps = PlacementSpec(hosts=['test:0.0.0.0=a'], count=1)
+ r = CephadmServe(cephadm_module)._apply_service(ServiceSpec('mgr', placement=ps))
+ assert not r
+ assert cephadm_module.health_checks.get('CEPHADM_DAEMON_PLACE_FAIL') is not None
+ assert cephadm_module.health_checks['CEPHADM_DAEMON_PLACE_FAIL']['count'] == 1
+ assert 'Failed to place 1 daemon(s)' in cephadm_module.health_checks['CEPHADM_DAEMON_PLACE_FAIL']['summary']
+ assert 'Failed while placing mgr.a on test: fail' in cephadm_module.health_checks['CEPHADM_DAEMON_PLACE_FAIL']['detail']
+
+ @mock.patch("cephadm.serve.CephadmServe._run_cephadm")
+ def test_apply_spec_fail_health_warning(self, _run_cephadm, cephadm_module: CephadmOrchestrator):
+ _run_cephadm.return_value = ('{}', '', 0)
+ with with_host(cephadm_module, 'test'):
+ CephadmServe(cephadm_module)._apply_all_services()
+ ps = PlacementSpec(hosts=['fail'], count=1)
+ r = CephadmServe(cephadm_module)._apply_service(ServiceSpec('mgr', placement=ps))
+ assert not r
+ assert cephadm_module.apply_spec_fails
+ assert cephadm_module.health_checks.get('CEPHADM_APPLY_SPEC_FAIL') is not None
+ assert cephadm_module.health_checks['CEPHADM_APPLY_SPEC_FAIL']['count'] == 1
+ assert 'Failed to apply 1 service(s)' in cephadm_module.health_checks['CEPHADM_APPLY_SPEC_FAIL']['summary']
+
@mock.patch("cephadm.module.CephadmOrchestrator.get_foreign_ceph_option")
@mock.patch("cephadm.serve.CephadmServe._run_cephadm")
def test_invalid_config_option_health_warning(self, _run_cephadm, get_foreign_ceph_option, cephadm_module: CephadmOrchestrator):