From: Sage Weil <sage@redhat.com>
Date: Sat, 8 Feb 2020 16:05:39 +0000 (-0600)
Subject: mgr/cephadm: do not crash module on exception in serve thread
X-Git-Tag: v15.1.1~465^2~1
X-Git-Url: http://git.apps.os.sepia.ceph.com/?a=commitdiff_plain;h=646a67a6060f4667e56a094682754da782dca770;p=ceph-ci.git

mgr/cephadm: do not crash module on exception in serve thread

This is a band-aid over a larger problem: we really need to process each
host independently and proceed with partial results when some hosts fail.
(Also, we should query hosts in parallel.)

In the meantime, this avoids crashing the cephadm module entirely.

Fixes: https://tracker.ceph.com/issues/44018
Signed-off-by: Sage Weil <sage@redhat.com>
---

diff --git a/src/pybind/mgr/cephadm/module.py b/src/pybind/mgr/cephadm/module.py
index 1a38ef0876c..a1c2c8819e7 100644
--- a/src/pybind/mgr/cephadm/module.py
+++ b/src/pybind/mgr/cephadm/module.py
@@ -747,6 +747,12 @@ class CephadmOrchestrator(MgrModule, orchestrator.OrchestratorClientMixin):
                 }
         self.set_health_checks(self.health_checks)
 
+    def _serve_sleep(self):  # wait on self.event for up to 10 min, then reset it
+        sleep_interval = 600
+        self.log.debug('Sleeping for %d seconds', sleep_interval)
+        self.event.wait(sleep_interval)  # result ignored: timeout and wake-up are handled alike
+        self.event.clear()
+
     def serve(self):
         # type: () -> None
         self.log.info("serve starting")
@@ -757,7 +763,23 @@ class CephadmOrchestrator(MgrModule, orchestrator.OrchestratorClientMixin):
             self.log.debug('refreshing services')
             completion = self._get_services(maybe_refresh=True)
             self._orchestrator_wait([completion])
-            orchestrator.raise_if_exception(completion)
+            # FIXME: this is a band-aid to avoid crashing the mgr, but what
+            # we really need to do here is raise health alerts for individual
+            # hosts that fail and continue with the ones that do not fail.
+            if completion.exception is not None:
+                self.log.error('failed to refresh services: %s' % completion.exception)
+                self.health_checks['CEPHADM_REFRESH_FAILED'] = {
+                    'severity': 'warning',
+                    'summary': 'failed to probe one or more hosts',
+                    'count': 1,
+                    'detail': [str(completion.exception)],
+                }
+                self.set_health_checks(self.health_checks)
+                self._serve_sleep()
+                continue
+            if 'CEPHADM_REFRESH_FAILED' in self.health_checks:
+                del self.health_checks['CEPHADM_REFRESH_FAILED']
+                self.set_health_checks(self.health_checks)
             services = completion.result
             self.log.debug('services %s' % services)
 
@@ -772,13 +794,11 @@ class CephadmOrchestrator(MgrModule, orchestrator.OrchestratorClientMixin):
                             time.sleep(1)
                         else:
                             break
-                    orchestrator.raise_if_exception(completion)
+                    if completion.exception is not None:
+                        self.log.error(str(completion.exception))
                 self.log.debug('did _do_upgrade')
             else:
-                sleep_interval = 600
-                self.log.debug('Sleeping for %d seconds', sleep_interval)
-                ret = self.event.wait(sleep_interval)
-                self.event.clear()
+                self._serve_sleep()
         self.log.info("serve exit")
 
     def config_notify(self):