mgr/cephadm: retry mgr fail over in case of transient failure 45863/head
author    Adam King <adking@redhat.com>
          Mon, 11 Apr 2022 20:57:51 +0000 (16:57 -0400)
committer Adam King <adking@redhat.com>
          Wed, 13 Apr 2022 22:05:06 +0000 (18:05 -0400)
Fixes: https://tracker.ceph.com/issues/55279
Signed-off-by: Adam King <adking@redhat.com>
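
The change below wraps the existing failover logic in a retry-with-backoff loop so a transiently missing standby no longer aborts the operation. As a minimal, standalone sketch of that pattern (the has_standby and run_mgr_fail callables here are hypothetical stand-ins for mgr_map_has_standby() and the 'mgr fail' mon command used in the patch, not part of the cephadm API):

import logging
import time

logger = logging.getLogger(__name__)


class NoStandbyError(Exception):
    pass


def fail_over_with_retry(has_standby, run_mgr_fail):
    """Try to fail over the active mgr, retrying when no standby is visible yet."""
    # A transient race in the mgr map can make the standby briefly invisible,
    # so retry a few times with increasing delays before giving up.
    for sleep_secs in [2, 8, 15]:
        try:
            if not has_standby():
                raise NoStandbyError('Need standby mgr daemon')
            run_mgr_fail()  # issue the actual failover command
            return          # success: stop retrying
        except NoStandbyError:
            logger.info('No standby mgr found, retrying in %d seconds', sleep_secs)
            time.sleep(sleep_secs)
    # all retries exhausted: surface the error to the caller
    raise NoStandbyError('Need standby mgr daemon')

The actual patch follows the same shape, raising its locally defined NoStandbyError subclass of OrchestratorError so that only the "no standby" case is retried and any other failure propagates immediately.
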
src/pybind/mgr/cephadm/services/cephadmservice.py

index 9eaa90270780a842d0ac4ae959aeadd1c1eef7e4..8abb0e63a2c102181d7937c3cf8f99c357a4fbb9 100644 (file)
@@ -3,6 +3,7 @@ import json
 import logging
 import re
 import socket
+import time
 from abc import ABCMeta, abstractmethod
 from typing import TYPE_CHECKING, List, Callable, TypeVar, \
     Optional, Dict, Any, Tuple, NewType, cast
@@ -672,19 +673,31 @@ class MgrService(CephService):
         return DaemonDescription()
 
     def fail_over(self) -> None:
-        if not self.mgr_map_has_standby():
-            raise OrchestratorError('Need standby mgr daemon', event_kind_subject=(
-                'daemon', 'mgr' + self.mgr.get_mgr_id()))
-
-        self.mgr.events.for_daemon('mgr' + self.mgr.get_mgr_id(),
-                                   'INFO', 'Failing over to other MGR')
-        logger.info('Failing over to other MGR')
-
-        # fail over
-        ret, out, err = self.mgr.check_mon_command({
-            'prefix': 'mgr fail',
-            'who': self.mgr.get_mgr_id(),
-        })
+        # this has been seen to sometimes transiently fail even when there are multiple
+        # mgr daemons. As long as there are multiple known mgr daemons, we should retry.
+        class NoStandbyError(OrchestratorError):
+            pass
+        no_standby_exc = NoStandbyError('Need standby mgr daemon', event_kind_subject=(
+            'daemon', 'mgr' + self.mgr.get_mgr_id()))
+        for sleep_secs in [2, 8, 15]:
+            try:
+                if not self.mgr_map_has_standby():
+                    raise no_standby_exc
+                self.mgr.events.for_daemon('mgr' + self.mgr.get_mgr_id(),
+                                           'INFO', 'Failing over to other MGR')
+                logger.info('Failing over to other MGR')
+
+                # fail over
+                ret, out, err = self.mgr.check_mon_command({
+                    'prefix': 'mgr fail',
+                    'who': self.mgr.get_mgr_id(),
+                })
+                return
+            except NoStandbyError:
+                logger.info(
+                    f'Failed to find standby mgr for failover. Retrying in {sleep_secs} seconds')
+                time.sleep(sleep_secs)
+        raise no_standby_exc
 
     def mgr_map_has_standby(self) -> bool:
         """