From 313e091fee6b3399852683575f7431c1ba235133 Mon Sep 17 00:00:00 2001
From: Sebastian Wagner
Date: Wed, 29 Jul 2020 12:55:04 +0200
Subject: [PATCH] mgr/cephadm: guard _check_daemons

Continue with other daemons, if one fails

Fixes: https://tracker.ceph.com/issues/46748
Signed-off-by: Sebastian Wagner
---
Review note (this area, up to the first file diff, is not recorded in the
commit): the patch was restored from a copy in which all line breaks and
leading indentation had been collapsed. The number of context, added and
removed lines in every hunk was checked against its hunk header and
against the diffstat below. Leading indentation was rebuilt from the
Python block structure and is therefore an assumption - compare it with
the target tree before applying. The author e-mail addresses are absent
from the From: and Signed-off-by: lines in this copy and were not
reconstructed.

 src/pybind/mgr/cephadm/inventory.py | 18 ++++++++++++++++++
 src/pybind/mgr/cephadm/module.py    | 27 ++++++++++++++-------------
 2 files changed, 32 insertions(+), 13 deletions(-)

diff --git a/src/pybind/mgr/cephadm/inventory.py b/src/pybind/mgr/cephadm/inventory.py
index 7c6b1e77c95..10eccf769c2 100644
--- a/src/pybind/mgr/cephadm/inventory.py
+++ b/src/pybind/mgr/cephadm/inventory.py
@@ -514,10 +514,28 @@ class EventStore():
         e = OrchestratorEvent(datetime.datetime.utcnow(), 'service', spec.service_name(), level, message)
         self.add(e)
 
+    def from_orch_error(self, e: OrchestratorError):
+        if e.event_subject is not None:
+            self.add(OrchestratorEvent(
+                datetime.datetime.utcnow(),
+                e.event_subject[0],
+                e.event_subject[1],
+                "ERROR",
+                str(e)
+            ))
+
+
     def for_daemon(self, daemon_name, level, message):
         e = OrchestratorEvent(datetime.datetime.utcnow(), 'daemon', daemon_name, level, message)
         self.add(e)
 
+    def for_daemon_from_exception(self, daemon_name, e: Exception):
+        self.for_daemon(
+            daemon_name,
+            "ERROR",
+            str(e)
+        )
+
     def cleanup(self) -> None:
         # Needs to be properly done, in case events are persistently stored.
 
diff --git a/src/pybind/mgr/cephadm/module.py b/src/pybind/mgr/cephadm/module.py
index 6c93a254843..8ebe570cf8f 100644
--- a/src/pybind/mgr/cephadm/module.py
+++ b/src/pybind/mgr/cephadm/module.py
@@ -514,13 +514,7 @@ class CephadmOrchestrator(orchestrator.Orchestrator, MgrModule,
 
             except OrchestratorError as e:
                 if e.event_subject:
-                    self.events.add(OrchestratorEvent(
-                        datetime.datetime.utcnow(),
-                        e.event_subject[0],
-                        e.event_subject[1],
-                        "ERROR",
-                        str(e)
-                    ))
+                    self.events.from_orch_error(e)
 
             self._serve_sleep()
         self.log.debug("serve exit")
@@ -2125,12 +2119,19 @@ you may want to run:
                 self.log.info('Reconfiguring %s (monmap changed)...' % dd.name())
                 reconfig = True
             if reconfig:
-                self._create_daemon(
-                    CephadmDaemonSpec(
-                        host=dd.hostname,
-                        daemon_id=dd.daemon_id,
-                        daemon_type=dd.daemon_type),
-                    reconfig=True)
+                try:
+                    self._create_daemon(
+                        CephadmDaemonSpec(
+                            host=dd.hostname,
+                            daemon_id=dd.daemon_id,
+                            daemon_type=dd.daemon_type),
+                        reconfig=True)
+                except OrchestratorError as e:
+                    self.events.from_orch_error(e)
+                    # continue...
+                except Exception as e:
+                    self.events.for_daemon_from_exception(dd.name(), e)
+                    # continue...
 
         # do daemon post actions
         for daemon_type, daemon_descs in daemons_post.items():
-- 
2.39.5