From: Melissa Li Date: Thu, 5 Aug 2021 18:14:38 +0000 (-0400) Subject: mgr/cephadm: set health check warning for apply spec failures and daemon place failur... X-Git-Tag: v16.2.7~33^2~20 X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=9bd96ff24d9a9b9ea31d507ea1f7979a22c4c4df;p=ceph.git mgr/cephadm: set health check warning for apply spec failures and daemon place failures in serve Fixes: https://tracker.ceph.com/issues/44414 Signed-off-by: Melissa Li (cherry picked from commit 1ccdd941c938d6fc3cd8996353495b28545427b1) Conflicts: src/pybind/mgr/cephadm/module.py src/pybind/mgr/cephadm/serve.py --- diff --git a/src/pybind/mgr/cephadm/module.py b/src/pybind/mgr/cephadm/module.py index 6f44e49502ff..c88a4a6bdbc1 100644 --- a/src/pybind/mgr/cephadm/module.py +++ b/src/pybind/mgr/cephadm/module.py @@ -407,7 +407,7 @@ class CephadmOrchestrator(orchestrator.Orchestrator, MgrModule, self.default_registry = '' self.autotune_memory_target_ratio = 0.0 self.autotune_interval = 0 - + self.apply_spec_fails: List[Tuple[str, str]] = [] self.max_osd_draining_count = 10 self._cons: Dict[str, Tuple[remoto.backends.BaseConnection, diff --git a/src/pybind/mgr/cephadm/serve.py b/src/pybind/mgr/cephadm/serve.py index 186bd4dd7a14..acbdd9af53ad 100644 --- a/src/pybind/mgr/cephadm/serve.py +++ b/src/pybind/mgr/cephadm/serve.py @@ -548,14 +548,25 @@ class CephadmServe: specs = [] # type: List[ServiceSpec] for sn, spec in self.mgr.spec_store.active_specs.items(): specs.append(spec) + for name in ['CEPHADM_APPLY_SPEC_FAIL', 'CEPHADM_DAEMON_PLACE_FAIL']: + self.mgr.remove_health_warning(name) + self.mgr.apply_spec_fails = [] for spec in specs: try: if self._apply_service(spec): r = True except Exception as e: - self.log.exception('Failed to apply %s spec %s: %s' % ( - spec.service_name(), spec, e)) + msg = f'Failed to apply {spec.service_name()} spec {spec}: {str(e)}' + self.log.exception(msg) self.mgr.events.for_service(spec, 'ERROR', 'Failed to apply: ' + str(e)) + self.mgr.apply_spec_fails.append((spec.service_name(), str(e))) + warnings = [] + for x in self.mgr.apply_spec_fails: + warnings.append(f'{x[0]}: {x[1]}') + self.mgr.set_health_warning('CEPHADM_APPLY_SPEC_FAIL', + f"Failed to apply {len(self.mgr.apply_spec_fails)} service(s): {','.join(x[0] for x in self.mgr.apply_spec_fails)}", + len(self.mgr.apply_spec_fails), + warnings) return r @@ -670,9 +681,17 @@ class CephadmServe: 'status', '').lower() not in ['maintenance', 'offline'] and d.hostname not in self.mgr.offline_hosts)] self.log.debug('Add %s, remove %s' % (slots_to_add, daemons_to_remove)) except OrchestratorError as e: - self.log.error('Failed to apply %s spec %s: %s' % ( - spec.service_name(), spec, e)) + msg = f'Failed to apply {spec.service_name()} spec {spec}: {str(e)}' + self.log.error(msg) self.mgr.events.for_service(spec, 'ERROR', 'Failed to apply: ' + str(e)) + self.mgr.apply_spec_fails.append((spec.service_name(), str(e))) + warnings = [] + for x in self.mgr.apply_spec_fails: + warnings.append(f'{x[0]}: {x[1]}') + self.mgr.set_health_warning('CEPHADM_APPLY_SPEC_FAIL', + f"Failed to apply {len(self.mgr.apply_spec_fails)} service(s): {','.join(x[0] for x in self.mgr.apply_spec_fails)}", + len(self.mgr.apply_spec_fails), + warnings) return False r = None @@ -746,6 +765,7 @@ class CephadmServe: svc.fence_old_ranks(spec, rank_map, len(all_slots)) # create daemons + daemon_place_fails = [] for slot in slots_to_add: # first remove daemon on conflicting port? if slot.ports: @@ -794,6 +814,7 @@ class CephadmServe: f"on {slot.hostname}: {e}") self.mgr.events.for_service(spec, 'ERROR', msg) self.mgr.log.error(msg) + daemon_place_fails.append(msg) # only return "no change" if no one else has already succeeded. # later successes will also change to True if r is None: @@ -810,6 +831,9 @@ class CephadmServe: ) daemons.append(sd) + if daemon_place_fails: + self.mgr.set_health_warning('CEPHADM_DAEMON_PLACE_FAIL', f'Failed to place {len(daemon_place_fails)} daemon(s)', len(daemon_place_fails), daemon_place_fails) + # remove any? def _ok_to_stop(remove_daemons: List[orchestrator.DaemonDescription]) -> bool: daemon_ids = [d.daemon_id for d in remove_daemons] diff --git a/src/pybind/mgr/cephadm/tests/test_cephadm.py b/src/pybind/mgr/cephadm/tests/test_cephadm.py index a83042cf49f2..85bc87a9dab1 100644 --- a/src/pybind/mgr/cephadm/tests/test_cephadm.py +++ b/src/pybind/mgr/cephadm/tests/test_cephadm.py @@ -858,6 +858,32 @@ class TestCephadm(object): 'entity': entity, }) + @mock.patch("cephadm.serve.CephadmServe._run_cephadm") + def test_daemon_place_fail_health_warning(self, _run_cephadm, cephadm_module): + _run_cephadm.return_value = ('{}', '', 0) + with with_host(cephadm_module, 'test'): + _run_cephadm.side_effect = OrchestratorError('fail') + ps = PlacementSpec(hosts=['test:0.0.0.0=a'], count=1) + r = CephadmServe(cephadm_module)._apply_service(ServiceSpec('mgr', placement=ps)) + assert not r + assert cephadm_module.health_checks.get('CEPHADM_DAEMON_PLACE_FAIL') is not None + assert cephadm_module.health_checks['CEPHADM_DAEMON_PLACE_FAIL']['count'] == 1 + assert 'Failed to place 1 daemon(s)' in cephadm_module.health_checks['CEPHADM_DAEMON_PLACE_FAIL']['summary'] + assert 'Failed while placing mgr.a on test: fail' in cephadm_module.health_checks['CEPHADM_DAEMON_PLACE_FAIL']['detail'] + + @mock.patch("cephadm.serve.CephadmServe._run_cephadm") + def test_apply_spec_fail_health_warning(self, _run_cephadm, cephadm_module: CephadmOrchestrator): + _run_cephadm.return_value = ('{}', '', 0) + with with_host(cephadm_module, 'test'): + CephadmServe(cephadm_module)._apply_all_services() + ps = PlacementSpec(hosts=['fail'], count=1) + r = CephadmServe(cephadm_module)._apply_service(ServiceSpec('mgr', placement=ps)) + assert not r + assert cephadm_module.apply_spec_fails + assert cephadm_module.health_checks.get('CEPHADM_APPLY_SPEC_FAIL') is not None + assert cephadm_module.health_checks['CEPHADM_APPLY_SPEC_FAIL']['count'] == 1 + assert 'Failed to apply 1 service(s)' in cephadm_module.health_checks['CEPHADM_APPLY_SPEC_FAIL']['summary'] + @mock.patch("cephadm.module.CephadmOrchestrator.get_foreign_ceph_option") @mock.patch("cephadm.serve.CephadmServe._run_cephadm") def test_invalid_config_option_health_warning(self, _run_cephadm, get_foreign_ceph_option, cephadm_module: CephadmOrchestrator):