git.apps.os.sepia.ceph.com Git - ceph.git/commitdiff
mgr/cephadm: guard _check_daemons
author Sebastian Wagner <sebastian.wagner@suse.com>
Wed, 29 Jul 2020 10:55:04 +0000 (12:55 +0200)
committer Sebastian Wagner <sebastian.wagner@suse.com>
Fri, 21 Aug 2020 11:04:01 +0000 (13:04 +0200)
Continue with other daemons, if one fails

Fixes: https://tracker.ceph.com/issues/46748
Signed-off-by: Sebastian Wagner <sebastian.wagner@suse.com>
(cherry picked from commit 313e091fee6b3399852683575f7431c1ba235133)

src/pybind/mgr/cephadm/inventory.py
src/pybind/mgr/cephadm/module.py

index 46e353043cd749aed2e6eff3d868c89234b2a56c..4a76e15058f38df14c98c7993edc79455e0c7277 100644 (file)
@@ -516,10 +516,28 @@ class EventStore():
         e = OrchestratorEvent(datetime.datetime.utcnow(), 'service', spec.service_name(), level, message)
         self.add(e)
 
+    def from_orch_error(self, e: OrchestratorError):
+        if e.event_subject is not None:
+            self.add(OrchestratorEvent(
+                datetime.datetime.utcnow(),
+                e.event_subject[0],
+                e.event_subject[1],
+                "ERROR",
+                str(e)
+            ))
+
+
     def for_daemon(self, daemon_name, level, message):
         e = OrchestratorEvent(datetime.datetime.utcnow(), 'daemon', daemon_name, level, message)
         self.add(e)
 
+    def for_daemon_from_exception(self, daemon_name, e: Exception):
+        self.for_daemon(
+            daemon_name,
+            "ERROR",
+            str(e)
+        )
+
     def cleanup(self) -> None:
         # Needs to be properly done, in case events are persistently stored.
 
index f2fd8901430d1e988a0ffdab52fd2310124f3ddb..476c1d9923432082eed70b73facaa7eac8fd96a4 100644 (file)
@@ -515,13 +515,7 @@ class CephadmOrchestrator(orchestrator.Orchestrator, MgrModule):
 
             except OrchestratorError as e:
                 if e.event_subject:
-                    self.events.add(OrchestratorEvent(
-                        datetime.datetime.utcnow(),
-                        e.event_subject[0],
-                        e.event_subject[1],
-                        "ERROR",
-                        str(e)
-                    ))
+                    self.events.from_orch_error(e)
 
             self._serve_sleep()
         self.log.debug("serve exit")
@@ -2126,12 +2120,19 @@ you may want to run:
                 self.log.info('Reconfiguring %s (monmap changed)...' % dd.name())
                 reconfig = True
             if reconfig:
-                self._create_daemon(
-                    CephadmDaemonSpec(
-                        host=dd.hostname,
-                        daemon_id=dd.daemon_id,
-                        daemon_type=dd.daemon_type),
-                    reconfig=True)
+                try:
+                    self._create_daemon(
+                        CephadmDaemonSpec(
+                            host=dd.hostname,
+                            daemon_id=dd.daemon_id,
+                            daemon_type=dd.daemon_type),
+                        reconfig=True)
+                except OrchestratorError as e:
+                    self.events.from_orch_error(e)
+                    # continue...
+                except Exception as e:
+                    self.events.for_daemon_from_exception(dd.name(), e)
+                    # continue...
 
         # do daemon post actions
         for daemon_type, daemon_descs in daemons_post.items():