From 646a67a6060f4667e56a094682754da782dca770 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Sat, 8 Feb 2020 10:05:39 -0600 Subject: [PATCH] mgr/cephadm: do not crash module on exception in serve thread This is a band-aid over a larger problem: we really need to process each host independently and proceed with partial results when some hosts fail. (Also, we should query hosts in parallel.) In the meantime, this avoids crashing the cephadm module entirely. Fixes: https://tracker.ceph.com/issues/44018 Signed-off-by: Sage Weil --- src/pybind/mgr/cephadm/module.py | 32 ++++++++++++++++++++++++++------ 1 file changed, 26 insertions(+), 6 deletions(-) diff --git a/src/pybind/mgr/cephadm/module.py b/src/pybind/mgr/cephadm/module.py index 1a38ef0876c..a1c2c8819e7 100644 --- a/src/pybind/mgr/cephadm/module.py +++ b/src/pybind/mgr/cephadm/module.py @@ -747,6 +747,12 @@ class CephadmOrchestrator(MgrModule, orchestrator.OrchestratorClientMixin): } self.set_health_checks(self.health_checks) + def _serve_sleep(self): + sleep_interval = 600 + self.log.debug('Sleeping for %d seconds', sleep_interval) + ret = self.event.wait(sleep_interval) + self.event.clear() + def serve(self): # type: () -> None self.log.info("serve starting") @@ -757,7 +763,23 @@ class CephadmOrchestrator(MgrModule, orchestrator.OrchestratorClientMixin): self.log.debug('refreshing services') completion = self._get_services(maybe_refresh=True) self._orchestrator_wait([completion]) - orchestrator.raise_if_exception(completion) + # FIXME: this is a band-aid to avoid crashing the mgr, but what + # we really need to do here is raise health alerts for individual + # hosts that fail and continue with the ones that do not fail. + if completion.exception is not None: + self.log.error('failed to refresh services: %s' % completion.exception) + self.health_checks['CEPHADM_REFRESH_FAILED'] = { + 'severity': 'warning', + 'summary': 'failed to probe one or more hosts', + 'count': 1, + 'detail': [str(completion.exception)], + } + self.set_health_checks(self.health_checks) + self._serve_sleep() + continue + if 'CEPHADM_REFRESH_FAILED' in self.health_checks: + del self.health_checks['CEPHADM_REFRESH_FAILED'] + self.set_health_checks(self.health_checks) services = completion.result self.log.debug('services %s' % services) @@ -772,13 +794,11 @@ class CephadmOrchestrator(MgrModule, orchestrator.OrchestratorClientMixin): time.sleep(1) else: break - orchestrator.raise_if_exception(completion) + if completion.exception is not None: + self.log.error(str(completion.exception)) self.log.debug('did _do_upgrade') else: - sleep_interval = 600 - self.log.debug('Sleeping for %d seconds', sleep_interval) - ret = self.event.wait(sleep_interval) - self.event.clear() + self._serve_sleep() self.log.info("serve exit") def config_notify(self): -- 2.39.5