From 646a67a6060f4667e56a094682754da782dca770 Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@redhat.com>
Date: Sat, 8 Feb 2020 10:05:39 -0600
Subject: [PATCH] mgr/cephadm: do not crash module on exception in serve thread

This is a band-aid over a larger problem: we really need to process each
host independently and proceed with partial results when some hosts fail.
(Also, we should query hosts in parallel.)

In the meantime, this avoids crashing the cephadm module entirely.

Fixes: https://tracker.ceph.com/issues/44018
Signed-off-by: Sage Weil <sage@redhat.com>
---
 src/pybind/mgr/cephadm/module.py | 32 ++++++++++++++++++++++++++------
 1 file changed, 26 insertions(+), 6 deletions(-)

diff --git a/src/pybind/mgr/cephadm/module.py b/src/pybind/mgr/cephadm/module.py
index 1a38ef0876c..a1c2c8819e7 100644
--- a/src/pybind/mgr/cephadm/module.py
+++ b/src/pybind/mgr/cephadm/module.py
@@ -747,6 +747,12 @@ class CephadmOrchestrator(MgrModule, orchestrator.OrchestratorClientMixin):
                 }
         self.set_health_checks(self.health_checks)
 
+    def _serve_sleep(self):
+        sleep_interval = 600  # seconds; used as the timeout for event.wait()
+        self.log.debug('Sleeping for %d seconds', sleep_interval)
+        self.event.wait(sleep_interval)  # return value is not needed here
+        self.event.clear()
+
     def serve(self):
         # type: () -> None
         self.log.info("serve starting")
@@ -757,7 +763,23 @@ class CephadmOrchestrator(MgrModule, orchestrator.OrchestratorClientMixin):
             self.log.debug('refreshing services')
             completion = self._get_services(maybe_refresh=True)
             self._orchestrator_wait([completion])
-            orchestrator.raise_if_exception(completion)
+            # FIXME: this is a band-aid to avoid crashing the mgr, but what
+            # we really need to do here is raise health alerts for individual
+            # hosts that fail and continue with the ones that do not fail.
+            if completion.exception is not None:
+                self.log.error('failed to refresh services: %s' % completion.exception)
+                self.health_checks['CEPHADM_REFRESH_FAILED'] = {
+                    'severity': 'warning',
+                    'summary': 'failed to probe one or more hosts',
+                    'count': 1,
+                    'detail': [str(completion.exception)],
+                }
+                self.set_health_checks(self.health_checks)
+                self._serve_sleep()
+                continue
+            if 'CEPHADM_REFRESH_FAILED' in self.health_checks:
+                del self.health_checks['CEPHADM_REFRESH_FAILED']
+                self.set_health_checks(self.health_checks)
             services = completion.result
             self.log.debug('services %s' % services)
 
@@ -772,13 +794,11 @@ class CephadmOrchestrator(MgrModule, orchestrator.OrchestratorClientMixin):
                             time.sleep(1)
                         else:
                             break
-                    orchestrator.raise_if_exception(completion)
+                    if completion.exception is not None:
+                        self.log.error(str(completion.exception))
                 self.log.debug('did _do_upgrade')
             else:
-                sleep_interval = 600
-                self.log.debug('Sleeping for %d seconds', sleep_interval)
-                ret = self.event.wait(sleep_interval)
-                self.event.clear()
+                self._serve_sleep()
         self.log.info("serve exit")
 
     def config_notify(self):
-- 
2.39.5