mgr/cephadm: upgrade: fail gracefully if daemon redeploy fails

author Sebastian Wagner <sebastian.wagner@suse.com>

Tue, 17 Nov 2020 12:08:44 +0000 (13:08 +0100)

committer Sebastian Wagner <sebastian.wagner@suse.com>

Thu, 7 Jan 2021 12:02:21 +0000 (13:02 +0100)
author Sebastian Wagner <sebastian.wagner@suse.com>
Tue, 17 Nov 2020 12:08:44 +0000 (13:08 +0100)
committer Sebastian Wagner <sebastian.wagner@suse.com>
Thu, 7 Jan 2021 12:02:21 +0000 (13:02 +0100)
diff --git a/src/pybind/mgr/cephadm/upgrade.py b/src/pybind/mgr/cephadm/upgrade.py

index 6a130681d0cd51db9c41c6df3a181e90e51ff324..c7b26d592806fbf94878497ee0a2be61d025f122 100644 (file)
--- a/src/pybind/mgr/cephadm/upgrade.py
+++ b/src/pybind/mgr/cephadm/upgrade.py
@@ -57,6 +57,12 @@ class UpgradeState:
  
  
  class CephadmUpgrade:
+    UPGRADE_ERRORS = [
+        'UPGRADE_NO_STANDBY_MGR',
+        'UPGRADE_FAILED_PULL',
+        'UPGRADE_REDEPLOY_DAEMON',
+    ]
+
      def __init__(self, mgr: "CephadmOrchestrator"):
          self.mgr = mgr
  
@@ -186,13 +192,13 @@ class CephadmUpgrade:
          return False
  
      def _clear_upgrade_health_checks(self) -> None:
-        for k in ['UPGRADE_NO_STANDBY_MGR',
-                  'UPGRADE_FAILED_PULL']:
+        for k in self.UPGRADE_ERRORS:
              if k in self.mgr.health_checks:
                  del self.mgr.health_checks[k]
          self.mgr.set_health_checks(self.mgr.health_checks)
  
-    def _fail_upgrade(self, alert_id, alert) -> None:
+    def _fail_upgrade(self, alert_id: str, alert: dict) -> None:
+        assert alert_id in self.UPGRADE_ERRORS
          logger.error('Upgrade: Paused due to %s: %s' % (alert_id,
                                                          alert['summary']))
          if not self.upgrade_state:
@@ -329,13 +335,23 @@ class CephadmUpgrade:
                      return
                  logger.info('Upgrade: Redeploying %s.%s' %
                              (d.daemon_type, d.daemon_id))
-                self.mgr._daemon_action(
-                    d.daemon_type,
-                    d.daemon_id,
-                    d.hostname,
-                    'redeploy',
-                    image=target_image
-                )
+                try:
+                    self.mgr._daemon_action(
+                        d.daemon_type,
+                        d.daemon_id,
+                        d.hostname,
+                        'redeploy',
+                        image=target_image
+                    )
+                except Exception as e:
+                    self._fail_upgrade('UPGRADE_REDEPLOY_DAEMON', {
+                        'severity': 'warning',
+                        'summary': f'Upgrading daemon {d.name()} on host {d.hostname} failed.',
+                        'count': 1,
+                        'detail': [
+                            f'Upgrade daemon: {d.name()}: {e}'
+                        ],
+                    })
                  return
  
              if need_upgrade_self:
author	Sebastian Wagner <sebastian.wagner@suse.com>
	Tue, 17 Nov 2020 12:08:44 +0000 (13:08 +0100)
committer	Sebastian Wagner <sebastian.wagner@suse.com>
	Thu, 7 Jan 2021 12:02:21 +0000 (13:02 +0100)