From c64704a8932a7006dbee425f5cfffc41105edac7 Mon Sep 17 00:00:00 2001 From: Sebastian Wagner Date: Wed, 29 Jul 2020 12:55:04 +0200 Subject: [PATCH] mgr/cephadm: guard _check_daemons Continue with other daemons, if one fails Fixes: https://tracker.ceph.com/issues/46748 Signed-off-by: Sebastian Wagner (cherry picked from commit 313e091fee6b3399852683575f7431c1ba235133) --- src/pybind/mgr/cephadm/inventory.py | 18 ++++++++++++++++++ src/pybind/mgr/cephadm/module.py | 27 ++++++++++++++------------- 2 files changed, 32 insertions(+), 13 deletions(-) diff --git a/src/pybind/mgr/cephadm/inventory.py b/src/pybind/mgr/cephadm/inventory.py index 46e353043cd74..4a76e15058f38 100644 --- a/src/pybind/mgr/cephadm/inventory.py +++ b/src/pybind/mgr/cephadm/inventory.py @@ -516,10 +516,28 @@ class EventStore(): e = OrchestratorEvent(datetime.datetime.utcnow(), 'service', spec.service_name(), level, message) self.add(e) + def from_orch_error(self, e: OrchestratorError): + if e.event_subject is not None: + self.add(OrchestratorEvent( + datetime.datetime.utcnow(), + e.event_subject[0], + e.event_subject[1], + "ERROR", + str(e) + )) + + def for_daemon(self, daemon_name, level, message): e = OrchestratorEvent(datetime.datetime.utcnow(), 'daemon', daemon_name, level, message) self.add(e) + def for_daemon_from_exception(self, daemon_name, e: Exception): + self.for_daemon( + daemon_name, + "ERROR", + str(e) + ) + def cleanup(self) -> None: # Needs to be properly done, in case events are persistently stored. diff --git a/src/pybind/mgr/cephadm/module.py b/src/pybind/mgr/cephadm/module.py index f2fd8901430d1..476c1d9923432 100644 --- a/src/pybind/mgr/cephadm/module.py +++ b/src/pybind/mgr/cephadm/module.py @@ -515,13 +515,7 @@ class CephadmOrchestrator(orchestrator.Orchestrator, MgrModule): except OrchestratorError as e: if e.event_subject: - self.events.add(OrchestratorEvent( - datetime.datetime.utcnow(), - e.event_subject[0], - e.event_subject[1], - "ERROR", - str(e) - )) + self.events.from_orch_error(e) self._serve_sleep() self.log.debug("serve exit") @@ -2126,12 +2120,19 @@ you may want to run: self.log.info('Reconfiguring %s (monmap changed)...' % dd.name()) reconfig = True if reconfig: - self._create_daemon( - CephadmDaemonSpec( - host=dd.hostname, - daemon_id=dd.daemon_id, - daemon_type=dd.daemon_type), - reconfig=True) + try: + self._create_daemon( + CephadmDaemonSpec( + host=dd.hostname, + daemon_id=dd.daemon_id, + daemon_type=dd.daemon_type), + reconfig=True) + except OrchestratorError as e: + self.events.from_orch_error(e) + # continue... + except Exception as e: + self.events.for_daemon_from_exception(dd.name(), e) + # continue... # do daemon post actions for daemon_type, daemon_descs in daemons_post.items(): -- 2.39.5