git-server-git.apps.pok.os.sepia.ceph.com Git - ceph.git/commitdiff
mgr/cephadm: upgrade: fail gracefully, if daemon redeploy fails
author: Sebastian Wagner <sebastian.wagner@suse.com>
Tue, 17 Nov 2020 12:08:44 +0000 (13:08 +0100)
committer: Sebastian Wagner <sebastian.wagner@suse.com>
Thu, 7 Jan 2021 12:02:21 +0000 (13:02 +0100)
Current behaviour kills the `serve()` loop of mgr/cephadm

Fixes: https://tracker.ceph.com/issues/47684
Signed-off-by: Sebastian Wagner <sebastian.wagner@suse.com>
(cherry picked from commit 1ceb6dfb15781569ece627fc53bbd1eb294bf2b7)

src/pybind/mgr/cephadm/upgrade.py

index 6a130681d0cd51db9c41c6df3a181e90e51ff324..c7b26d592806fbf94878497ee0a2be61d025f122 100644 (file)
@@ -57,6 +57,12 @@ class UpgradeState:
 
 
 class CephadmUpgrade:
+    UPGRADE_ERRORS = [
+        'UPGRADE_NO_STANDBY_MGR',
+        'UPGRADE_FAILED_PULL',
+        'UPGRADE_REDEPLOY_DAEMON',
+    ]
+
     def __init__(self, mgr: "CephadmOrchestrator"):
         self.mgr = mgr
 
@@ -186,13 +192,13 @@ class CephadmUpgrade:
         return False
 
     def _clear_upgrade_health_checks(self) -> None:
-        for k in ['UPGRADE_NO_STANDBY_MGR',
-                  'UPGRADE_FAILED_PULL']:
+        for k in self.UPGRADE_ERRORS:
             if k in self.mgr.health_checks:
                 del self.mgr.health_checks[k]
         self.mgr.set_health_checks(self.mgr.health_checks)
 
-    def _fail_upgrade(self, alert_id, alert) -> None:
+    def _fail_upgrade(self, alert_id: str, alert: dict) -> None:
+        assert alert_id in self.UPGRADE_ERRORS
         logger.error('Upgrade: Paused due to %s: %s' % (alert_id,
                                                         alert['summary']))
         if not self.upgrade_state:
@@ -329,13 +335,23 @@ class CephadmUpgrade:
                     return
                 logger.info('Upgrade: Redeploying %s.%s' %
                             (d.daemon_type, d.daemon_id))
-                self.mgr._daemon_action(
-                    d.daemon_type,
-                    d.daemon_id,
-                    d.hostname,
-                    'redeploy',
-                    image=target_image
-                )
+                try:
+                    self.mgr._daemon_action(
+                        d.daemon_type,
+                        d.daemon_id,
+                        d.hostname,
+                        'redeploy',
+                        image=target_image
+                    )
+                except Exception as e:
+                    self._fail_upgrade('UPGRADE_REDEPLOY_DAEMON', {
+                        'severity': 'warning',
+                        'summary': f'Upgrading daemon {d.name()} on host {d.hostname} failed.',
+                        'count': 1,
+                        'detail': [
+                            f'Upgrade daemon: {d.name()}: {e}'
+                        ],
+                    })
                 return
 
             if need_upgrade_self: