From: Adam King Date: Fri, 21 Oct 2022 17:12:53 +0000 (-0400) Subject: mgr/cephadm: raise a better error on connection error during upgrade X-Git-Tag: v17.2.6~71^2~4 X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=3787501849be5a9ce72c3df2f9ad536e21780684;p=ceph.git mgr/cephadm: raise a better error on connection error during upgrade Right now failures to connect to a host during the upgrade result in a "failed due to an unexpected exception" error. We can do a bit better than that. Fixes: https://tracker.ceph.com/issues/57891 Signed-off-by: Adam King (cherry picked from commit 225cd2d84b9753b985113998f7f3e969794884ed) --- diff --git a/src/pybind/mgr/cephadm/upgrade.py b/src/pybind/mgr/cephadm/upgrade.py index 3b86dd78ac51..e40d7577299d 100644 --- a/src/pybind/mgr/cephadm/upgrade.py +++ b/src/pybind/mgr/cephadm/upgrade.py @@ -10,6 +10,7 @@ from cephadm.serve import CephadmServe from cephadm.services.cephadmservice import CephadmDaemonDeploySpec from cephadm.utils import ceph_release_to_major, name_to_config_section, CEPH_UPGRADE_ORDER, \ MONITORING_STACK_TYPES, CEPH_TYPES, GATEWAY_TYPES +from cephadm.ssh import HostConnectionError from orchestrator import OrchestratorError, DaemonDescription, DaemonDescriptionStatus, daemon_type_to_service if TYPE_CHECKING: @@ -118,7 +119,8 @@ class CephadmUpgrade: 'UPGRADE_FAILED_PULL', 'UPGRADE_REDEPLOY_DAEMON', 'UPGRADE_BAD_TARGET_VERSION', - 'UPGRADE_EXCEPTION' + 'UPGRADE_EXCEPTION', + 'UPGRADE_OFFLINE_HOST' ] def __init__(self, mgr: "CephadmOrchestrator"): @@ -484,6 +486,14 @@ class CephadmUpgrade: if self.upgrade_state and not self.upgrade_state.paused: try: self._do_upgrade() + except HostConnectionError as e: + self._fail_upgrade('UPGRADE_OFFLINE_HOST', { + 'severity': 'error', + 'summary': f'Upgrade: Failed to connect to host {e.hostname} at addr ({e.addr})', + 'count': 1, + 'detail': [f'SSH connection failed to {e.hostname} at addr ({e.addr}): {str(e)}'], + }) + return False except Exception as e: self._fail_upgrade('UPGRADE_EXCEPTION', { 'severity': 'error', @@ -976,6 +986,18 @@ class CephadmUpgrade: logger.debug('_do_upgrade no state, exiting') return + if self.mgr.offline_hosts: + # offline host(s), on top of potential connection errors when trying to upgrade a daemon + # or pull an image, can cause issues where daemons are never ok to stop. Since evaluating + # whether or not that risk is present for any given offline hosts is a difficult problem, + # it's best to just fail upgrade cleanly so user can address the offline host(s) + + # the HostConnectionError expects a hostname and addr, so let's just take + # one at random. It doesn't really matter which host we say we couldn't reach here. + hostname: str = list(self.mgr.offline_hosts)[0] + addr: str = self.mgr.inventory.get_addr(hostname) + raise HostConnectionError(f'Host(s) were marked offline: {self.mgr.offline_hosts}', hostname, addr) + target_image = self.target_image target_id = self.upgrade_state.target_id target_digests = self.upgrade_state.target_digests