git.apps.os.sepia.ceph.com Git - ceph-ci.git/commitdiff
mgr/cephadm: do not crash module on exception in serve thread
author: Sage Weil <sage@redhat.com>
Sat, 8 Feb 2020 16:05:39 +0000 (10:05 -0600)
committer: Sage Weil <sage@redhat.com>
Sat, 8 Feb 2020 16:14:48 +0000 (10:14 -0600)
This is a band-aid over a larger problem: we really need to process each
host independently and proceed with partial results when some hosts fail.
(Also, we should query hosts in parallel.)

In the meantime, this avoids crashing the cephadm module entirely.

Fixes: https://tracker.ceph.com/issues/44018
Signed-off-by: Sage Weil <sage@redhat.com>
src/pybind/mgr/cephadm/module.py

index 1a38ef0876c66d87b1f8ebc2088a2098485f9113..a1c2c8819e7f6523aabf8a4f37ddd7f9bf8efeb9 100644 (file)
@@ -747,6 +747,12 @@ class CephadmOrchestrator(MgrModule, orchestrator.OrchestratorClientMixin):
                 }
         self.set_health_checks(self.health_checks)
 
+    def _serve_sleep(self):
+        sleep_interval = 600
+        self.log.debug('Sleeping for %d seconds', sleep_interval)
+        ret = self.event.wait(sleep_interval)
+        self.event.clear()
+
     def serve(self):
         # type: () -> None
         self.log.info("serve starting")
@@ -757,7 +763,23 @@ class CephadmOrchestrator(MgrModule, orchestrator.OrchestratorClientMixin):
             self.log.debug('refreshing services')
             completion = self._get_services(maybe_refresh=True)
             self._orchestrator_wait([completion])
-            orchestrator.raise_if_exception(completion)
+            # FIXME: this is a band-aid to avoid crashing the mgr, but what
+            # we really need to do here is raise health alerts for individual
+            # hosts that fail and continue with the ones that do not fail.
+            if completion.exception is not None:
+                self.log.error('failed to refresh services: %s' % completion.exception)
+                self.health_checks['CEPHADM_REFRESH_FAILED'] = {
+                    'severity': 'warning',
+                    'summary': 'failed to probe one or more hosts',
+                    'count': 1,
+                    'detail': [str(completion.exception)],
+                }
+                self.set_health_checks(self.health_checks)
+                self._serve_sleep()
+                continue
+            if 'CEPHADM_REFRESH_FAILED' in self.health_checks:
+                del self.health_checks['CEPHADM_REFRESH_FAILED']
+                self.set_health_checks(self.health_checks)
             services = completion.result
             self.log.debug('services %s' % services)
 
@@ -772,13 +794,11 @@ class CephadmOrchestrator(MgrModule, orchestrator.OrchestratorClientMixin):
                             time.sleep(1)
                         else:
                             break
-                    orchestrator.raise_if_exception(completion)
+                    if completion.exception is not None:
+                        self.log.error(str(completion.exception))
                 self.log.debug('did _do_upgrade')
             else:
-                sleep_interval = 600
-                self.log.debug('Sleeping for %d seconds', sleep_interval)
-                ret = self.event.wait(sleep_interval)
-                self.event.clear()
+                self._serve_sleep()
         self.log.info("serve exit")
 
     def config_notify(self):