mgr/cephadm: raise a better error on connection error during upgrade

author Adam King <adking@redhat.com>

Fri, 21 Oct 2022 17:12:53 +0000 (13:12 -0400)

committer Adam King <adking@redhat.com>

Fri, 17 Feb 2023 15:45:50 +0000 (10:45 -0500)
author Adam King <adking@redhat.com>
Fri, 21 Oct 2022 17:12:53 +0000 (13:12 -0400)
committer Adam King <adking@redhat.com>
Fri, 17 Feb 2023 15:45:50 +0000 (10:45 -0500)
diff --git a/src/pybind/mgr/cephadm/upgrade.py b/src/pybind/mgr/cephadm/upgrade.py

index 3b86dd78ac5115cdaf2b4d34530df28730b311a7..e40d7577299dfb7f64b326ec317b7effb885a066 100644 (file)
--- a/src/pybind/mgr/cephadm/upgrade.py
+++ b/src/pybind/mgr/cephadm/upgrade.py
@@ -10,6 +10,7 @@ from cephadm.serve import CephadmServe
  from cephadm.services.cephadmservice import CephadmDaemonDeploySpec
  from cephadm.utils import ceph_release_to_major, name_to_config_section, CEPH_UPGRADE_ORDER, \
      MONITORING_STACK_TYPES, CEPH_TYPES, GATEWAY_TYPES
+from cephadm.ssh import HostConnectionError
  from orchestrator import OrchestratorError, DaemonDescription, DaemonDescriptionStatus, daemon_type_to_service
  
  if TYPE_CHECKING:
@@ -118,7 +119,8 @@ class CephadmUpgrade:
          'UPGRADE_FAILED_PULL',
          'UPGRADE_REDEPLOY_DAEMON',
          'UPGRADE_BAD_TARGET_VERSION',
-        'UPGRADE_EXCEPTION'
+        'UPGRADE_EXCEPTION',
+        'UPGRADE_OFFLINE_HOST'
      ]
  
      def __init__(self, mgr: "CephadmOrchestrator"):
@@ -484,6 +486,14 @@ class CephadmUpgrade:
          if self.upgrade_state and not self.upgrade_state.paused:
              try:
                  self._do_upgrade()
+            except HostConnectionError as e:
+                self._fail_upgrade('UPGRADE_OFFLINE_HOST', {
+                    'severity': 'error',
+                    'summary': f'Upgrade: Failed to connect to host {e.hostname} at addr ({e.addr})',
+                    'count': 1,
+                    'detail': [f'SSH connection failed to {e.hostname} at addr ({e.addr}): {str(e)}'],
+                })
+                return False
              except Exception as e:
                  self._fail_upgrade('UPGRADE_EXCEPTION', {
                      'severity': 'error',
@@ -976,6 +986,18 @@ class CephadmUpgrade:
              logger.debug('_do_upgrade no state, exiting')
              return
  
+        if self.mgr.offline_hosts:
+            # offline host(s), on top of potential connection errors when trying to upgrade a daemon
+            # or pull an image, can cause issues where daemons are never ok to stop. Since evaluating
+            # whether or not that risk is present for any given offline hosts is a difficult problem,
+            # it's best to just fail upgrade cleanly so user can address the offline host(s)
+
+            # the HostConnectionError expects a hostname and addr, so let's just take
+            # one at random. It doesn't really matter which host we say we couldn't reach here.
+            hostname: str = list(self.mgr.offline_hosts)[0]
+            addr: str = self.mgr.inventory.get_addr(hostname)
+            raise HostConnectionError(f'Host(s) were marked offline: {self.mgr.offline_hosts}', hostname, addr)
+
          target_image = self.target_image
          target_id = self.upgrade_state.target_id
          target_digests = self.upgrade_state.target_digests
author	Adam King <adking@redhat.com>
	Fri, 21 Oct 2022 17:12:53 +0000 (13:12 -0400)
committer	Adam King <adking@redhat.com>
	Fri, 17 Feb 2023 15:45:50 +0000 (10:45 -0500)