From: Sebastian Wagner
Date: Tue, 17 Nov 2020 12:08:44 +0000 (+0100)
Subject: mgr/cephadm: upgrade: fail gracefully, if daemon redeploy fails
X-Git-Tag: v15.2.9~88^2~27
X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=951e85a18ba20560bc4f8c1097537f81a8530ba2;p=ceph.git

mgr/cephadm: upgrade: fail gracefully, if daemon redeploy fails

Current behaviour kills the `serve()` loop of mgr/cephadm

Fixes: https://tracker.ceph.com/issues/47684
Signed-off-by: Sebastian Wagner
(cherry picked from commit 1ceb6dfb15781569ece627fc53bbd1eb294bf2b7)
---

diff --git a/src/pybind/mgr/cephadm/upgrade.py b/src/pybind/mgr/cephadm/upgrade.py
index 6a130681d0cd..c7b26d592806 100644
--- a/src/pybind/mgr/cephadm/upgrade.py
+++ b/src/pybind/mgr/cephadm/upgrade.py
@@ -57,6 +57,12 @@ class UpgradeState:
 
 
 class CephadmUpgrade:
+    UPGRADE_ERRORS = [
+        'UPGRADE_NO_STANDBY_MGR',
+        'UPGRADE_FAILED_PULL',
+        'UPGRADE_REDEPLOY_DAEMON',
+    ]
+
     def __init__(self, mgr: "CephadmOrchestrator"):
         self.mgr = mgr
 
@@ -186,13 +192,13 @@ class CephadmUpgrade:
         return False
 
     def _clear_upgrade_health_checks(self) -> None:
-        for k in ['UPGRADE_NO_STANDBY_MGR',
-                  'UPGRADE_FAILED_PULL']:
+        for k in self.UPGRADE_ERRORS:
             if k in self.mgr.health_checks:
                 del self.mgr.health_checks[k]
         self.mgr.set_health_checks(self.mgr.health_checks)
 
-    def _fail_upgrade(self, alert_id, alert) -> None:
+    def _fail_upgrade(self, alert_id: str, alert: dict) -> None:
+        assert alert_id in self.UPGRADE_ERRORS
         logger.error('Upgrade: Paused due to %s: %s' % (alert_id,
                                                         alert['summary']))
         if not self.upgrade_state:
@@ -329,13 +335,23 @@ class CephadmUpgrade:
                     return
                 logger.info('Upgrade: Redeploying %s.%s' %
                             (d.daemon_type, d.daemon_id))
-                self.mgr._daemon_action(
-                    d.daemon_type,
-                    d.daemon_id,
-                    d.hostname,
-                    'redeploy',
-                    image=target_image
-                )
+                try:
+                    self.mgr._daemon_action(
+                        d.daemon_type,
+                        d.daemon_id,
+                        d.hostname,
+                        'redeploy',
+                        image=target_image
+                    )
+                except Exception as e:
+                    self._fail_upgrade('UPGRADE_REDEPLOY_DAEMON', {
+                        'severity': 'warning',
+                        'summary': f'Upgrading daemon {d.name()} on host {d.hostname} failed.',
+                        'count': 1,
+                        'detail': [
+                            f'Upgrade daemon: {d.name()}: {e}'
+                        ],
+                    })
                 return
 
             if need_upgrade_self: