git.apps.os.sepia.ceph.com Git - ceph-ci.git/commitdiff
mgr/cephadm: guard _check_daemons
author: Sebastian Wagner <sebastian.wagner@suse.com>
Wed, 29 Jul 2020 10:55:04 +0000 (12:55 +0200)
committer: Sebastian Wagner <sebastian.wagner@suse.com>
Wed, 5 Aug 2020 10:51:28 +0000 (12:51 +0200)
Continue with the other daemons if one fails.

Fixes: https://tracker.ceph.com/issues/46748
Signed-off-by: Sebastian Wagner <sebastian.wagner@suse.com>
src/pybind/mgr/cephadm/inventory.py
src/pybind/mgr/cephadm/module.py

index 7c6b1e77c95f783137b764a41aeb87eca318bb75..10eccf769c2675251955f634bca2bb91dce3e58a 100644 (file)
@@ -514,10 +514,28 @@ class EventStore():
         e = OrchestratorEvent(datetime.datetime.utcnow(), 'service', spec.service_name(), level, message)
         self.add(e)
 
+    def from_orch_error(self, e: OrchestratorError):
+        if e.event_subject is not None:
+            self.add(OrchestratorEvent(
+                datetime.datetime.utcnow(),
+                e.event_subject[0],
+                e.event_subject[1],
+                "ERROR",
+                str(e)
+            ))
+
+
     def for_daemon(self, daemon_name, level, message):
         e = OrchestratorEvent(datetime.datetime.utcnow(), 'daemon', daemon_name, level, message)
         self.add(e)
 
+    def for_daemon_from_exception(self, daemon_name, e: Exception):
+        self.for_daemon(
+            daemon_name,
+            "ERROR",
+            str(e)
+        )
+
     def cleanup(self) -> None:
         # Needs to be properly done, in case events are persistently stored.
 
index 6c93a254843d5f8c7686d86c62d2f6aa7215d9d7..8ebe570cf8fe67a99e892622cdc84ae662fcd386 100644 (file)
@@ -514,13 +514,7 @@ class CephadmOrchestrator(orchestrator.Orchestrator, MgrModule,
 
             except OrchestratorError as e:
                 if e.event_subject:
-                    self.events.add(OrchestratorEvent(
-                        datetime.datetime.utcnow(),
-                        e.event_subject[0],
-                        e.event_subject[1],
-                        "ERROR",
-                        str(e)
-                    ))
+                    self.events.from_orch_error(e)
 
             self._serve_sleep()
         self.log.debug("serve exit")
@@ -2125,12 +2119,19 @@ you may want to run:
                 self.log.info('Reconfiguring %s (monmap changed)...' % dd.name())
                 reconfig = True
             if reconfig:
-                self._create_daemon(
-                    CephadmDaemonSpec(
-                        host=dd.hostname,
-                        daemon_id=dd.daemon_id,
-                        daemon_type=dd.daemon_type),
-                    reconfig=True)
+                try:
+                    self._create_daemon(
+                        CephadmDaemonSpec(
+                            host=dd.hostname,
+                            daemon_id=dd.daemon_id,
+                            daemon_type=dd.daemon_type),
+                        reconfig=True)
+                except OrchestratorError as e:
+                    self.events.from_orch_error(e)
+                    # continue...
+                except Exception as e:
+                    self.events.for_daemon_from_exception(dd.name(), e)
+                    # continue...
 
         # do daemon post actions
         for daemon_type, daemon_descs in daemons_post.items():