From 3fe2d7d553d475f1fe3840c98ee31d71f6188a1a Mon Sep 17 00:00:00 2001 From: Adam King Date: Mon, 11 Apr 2022 16:57:51 -0400 Subject: [PATCH] mgr/cephadm: retry mgr fail over in case of transient failure Fixes: https://tracker.ceph.com/issues/55279 Signed-off-by: Adam King --- .../mgr/cephadm/services/cephadmservice.py | 39 ++++++++++++------- 1 file changed, 26 insertions(+), 13 deletions(-) diff --git a/src/pybind/mgr/cephadm/services/cephadmservice.py b/src/pybind/mgr/cephadm/services/cephadmservice.py index 9eaa90270780a..8abb0e63a2c10 100644 --- a/src/pybind/mgr/cephadm/services/cephadmservice.py +++ b/src/pybind/mgr/cephadm/services/cephadmservice.py @@ -3,6 +3,7 @@ import json import logging import re import socket +import time from abc import ABCMeta, abstractmethod from typing import TYPE_CHECKING, List, Callable, TypeVar, \ Optional, Dict, Any, Tuple, NewType, cast @@ -672,19 +673,31 @@ class MgrService(CephService): return DaemonDescription() def fail_over(self) -> None: - if not self.mgr_map_has_standby(): - raise OrchestratorError('Need standby mgr daemon', event_kind_subject=( - 'daemon', 'mgr' + self.mgr.get_mgr_id())) - - self.mgr.events.for_daemon('mgr' + self.mgr.get_mgr_id(), - 'INFO', 'Failing over to other MGR') - logger.info('Failing over to other MGR') - - # fail over - ret, out, err = self.mgr.check_mon_command({ - 'prefix': 'mgr fail', - 'who': self.mgr.get_mgr_id(), - }) + # this has been seen to sometimes transiently fail even when there are multiple + # mgr daemons. As long as there are multiple known mgr daemons, we should retry. + class NoStandbyError(OrchestratorError): + pass + no_standby_exc = NoStandbyError('Need standby mgr daemon', event_kind_subject=( + 'daemon', 'mgr' + self.mgr.get_mgr_id())) + for sleep_secs in [2, 8, 15]: + try: + if not self.mgr_map_has_standby(): + raise no_standby_exc + self.mgr.events.for_daemon('mgr' + self.mgr.get_mgr_id(), + 'INFO', 'Failing over to other MGR') + logger.info('Failing over to other MGR') + + # fail over + ret, out, err = self.mgr.check_mon_command({ + 'prefix': 'mgr fail', + 'who': self.mgr.get_mgr_id(), + }) + return + except NoStandbyError: + logger.info( + f'Failed to find standby mgr for failover. Retrying in {sleep_secs} seconds') + time.sleep(sleep_secs) + raise no_standby_exc def mgr_map_has_standby(self) -> bool: """ -- 2.39.5