git-server-git.apps.pok.os.sepia.ceph.com Git - ceph.git/commitdiff
mgr/cephadm: raise a better error on connection error during upgrade
author: Adam King <adking@redhat.com>
Fri, 21 Oct 2022 17:12:53 +0000 (13:12 -0400)
committer: Adam King <adking@redhat.com>
Fri, 17 Feb 2023 15:45:50 +0000 (10:45 -0500)
Right now failures to connect to a host during the upgrade result
in a "failed due to an unexpected exception" error. We can do a bit
better than that.

Fixes: https://tracker.ceph.com/issues/57891
Signed-off-by: Adam King <adking@redhat.com>
(cherry picked from commit 225cd2d84b9753b985113998f7f3e969794884ed)

src/pybind/mgr/cephadm/upgrade.py

index 3b86dd78ac5115cdaf2b4d34530df28730b311a7..e40d7577299dfb7f64b326ec317b7effb885a066 100644 (file)
@@ -10,6 +10,7 @@ from cephadm.serve import CephadmServe
 from cephadm.services.cephadmservice import CephadmDaemonDeploySpec
 from cephadm.utils import ceph_release_to_major, name_to_config_section, CEPH_UPGRADE_ORDER, \
     MONITORING_STACK_TYPES, CEPH_TYPES, GATEWAY_TYPES
+from cephadm.ssh import HostConnectionError
 from orchestrator import OrchestratorError, DaemonDescription, DaemonDescriptionStatus, daemon_type_to_service
 
 if TYPE_CHECKING:
@@ -118,7 +119,8 @@ class CephadmUpgrade:
         'UPGRADE_FAILED_PULL',
         'UPGRADE_REDEPLOY_DAEMON',
         'UPGRADE_BAD_TARGET_VERSION',
-        'UPGRADE_EXCEPTION'
+        'UPGRADE_EXCEPTION',
+        'UPGRADE_OFFLINE_HOST'
     ]
 
     def __init__(self, mgr: "CephadmOrchestrator"):
@@ -484,6 +486,14 @@ class CephadmUpgrade:
         if self.upgrade_state and not self.upgrade_state.paused:
             try:
                 self._do_upgrade()
+            except HostConnectionError as e:
+                self._fail_upgrade('UPGRADE_OFFLINE_HOST', {
+                    'severity': 'error',
+                    'summary': f'Upgrade: Failed to connect to host {e.hostname} at addr ({e.addr})',
+                    'count': 1,
+                    'detail': [f'SSH connection failed to {e.hostname} at addr ({e.addr}): {str(e)}'],
+                })
+                return False
             except Exception as e:
                 self._fail_upgrade('UPGRADE_EXCEPTION', {
                     'severity': 'error',
@@ -976,6 +986,18 @@ class CephadmUpgrade:
             logger.debug('_do_upgrade no state, exiting')
             return
 
+        if self.mgr.offline_hosts:
+            # offline host(s), on top of potential connection errors when trying to upgrade a daemon
+            # or pull an image, can cause issues where daemons are never ok to stop. Since evaluating
+            # whether or not that risk is present for any given offline hosts is a difficult problem,
+            # it's best to just fail upgrade cleanly so user can address the offline host(s)
+
+            # the HostConnectionError expects a hostname and addr, so let's just take
+            # one at random. It doesn't really matter which host we say we couldn't reach here.
+            hostname: str = list(self.mgr.offline_hosts)[0]
+            addr: str = self.mgr.inventory.get_addr(hostname)
+            raise HostConnectionError(f'Host(s) were marked offline: {self.mgr.offline_hosts}', hostname, addr)
+
         target_image = self.target_image
         target_id = self.upgrade_state.target_id
         target_digests = self.upgrade_state.target_digests