--- /dev/null
+
+from unittest import case
+import json
+
+from teuthology import misc
+from tasks.ceph_test_case import CephTestCase
+
+# TODO move definition of CephCluster
+from tasks.cephfs.filesystem import CephCluster
+
+
+class MgrCluster(CephCluster):
+ def __init__(self, ctx):
+ super(MgrCluster, self).__init__(ctx)
+ self.mgr_ids = list(misc.all_roles_of_type(ctx.cluster, 'mgr'))
+
+ if len(self.mgr_ids) == 0:
+ raise RuntimeError(
+ "This task requires at least one manager daemon")
+
+ self.mgr_daemons = dict(
+ [(mgr_id, self._ctx.daemons.get_daemon('mgr', mgr_id)) for mgr_id
+ in self.mgr_ids])
+
+ @property
+ def admin_remote(self):
+ first_mon = misc.get_first_mon(self._ctx, None)
+ (result,) = self._ctx.cluster.only(first_mon).remotes.iterkeys()
+ return result
+
+ def mgr_stop(self, mgr_id):
+ self.mgr_daemons[mgr_id].stop()
+
+ def mgr_fail(self, mgr_id):
+ self.mon_manager.raw_cluster_cmd("mgr", "fail", mgr_id)
+
+ def mgr_restart(self, mgr_id):
+ self.mgr_daemons[mgr_id].restart()
+
+ def get_mgr_map(self):
+ status = json.loads(
+ self.mon_manager.raw_cluster_cmd("status", "--format=json-pretty"))
+
+ return status["mgrmap"]
+
+ def get_active_id(self):
+ return self.get_mgr_map()["active_name"]
+
+ def get_standby_ids(self):
+ return [s['name'] for s in self.get_mgr_map()["standbys"]]
+
+
+class MgrTestCase(CephTestCase):
+ REQUIRE_MGRS = 1
+
+ def setUp(self):
+ super(MgrTestCase, self).setUp()
+
+ # The test runner should have populated this
+ assert self.mgr_cluster is not None
+
+ if len(self.mgr_cluster.mgr_ids) < self.REQUIRE_MGRS:
+ raise case.SkipTest("Only have {0} manager daemons, "
+ "{1} are required".format(
+ len(self.mgr_cluster.mgr_ids), self.REQUIRE_MGRS))
+
+ # Restart all the daemons
+ for daemon in self.mgr_cluster.mgr_daemons.values():
+ daemon.stop()
+
+ for mgr_id in self.mgr_cluster.mgr_ids:
+ self.mgr_cluster.mgr_fail(mgr_id)
+
+ for daemon in self.mgr_cluster.mgr_daemons.values():
+ daemon.restart()
+
+ # Wait for an active to come up
+ self.wait_until_true(lambda: self.mgr_cluster.get_active_id() != "",
+ timeout=20)
+
+ expect_standbys = set(self.mgr_cluster.mgr_ids) \
+ - {self.mgr_cluster.get_active_id()}
+ self.wait_until_true(
+ lambda: set(self.mgr_cluster.get_standby_ids()) == expect_standbys,
+ timeout=20)
--- /dev/null
+
+import logging
+
+from tasks.mgr.mgr_test_case import MgrTestCase
+
+
+log = logging.getLogger(__name__)
+
+
+class TestFailover(MgrTestCase):
+ REQUIRE_MGRS = 2
+
+ def test_timeout(self):
+ """
+ That when an active mgr stops responding, a standby is promoted
+ after mon_mgr_beacon_grace.
+ """
+
+ # Query which mgr is active
+ original_active = self.mgr_cluster.get_active_id()
+ original_standbys = self.mgr_cluster.get_standby_ids()
+
+ # Stop that daemon
+ self.mgr_cluster.mgr_stop(original_active)
+
+ # Assert that the other mgr becomes active
+ self.wait_until_true(
+ lambda: self.mgr_cluster.get_active_id() in original_standbys,
+ timeout=60
+ )
+
+ self.mgr_cluster.mgr_restart(original_active)
+ self.wait_until_true(
+ lambda: original_active in self.mgr_cluster.get_standby_ids(),
+ timeout=10
+ )
+
+ def test_explicit_fail(self):
+ """
+ That when a user explicitly fails a daemon, a standby immediately
+ replaces it.
+ :return:
+ """
+ # Query which mgr is active
+ original_active = self.mgr_cluster.get_active_id()
+ original_standbys = self.mgr_cluster.get_standby_ids()
+
+ self.mgr_cluster.mgr_fail(original_active)
+
+ # A standby should take over
+ self.wait_until_true(
+ lambda: self.mgr_cluster.get_active_id() in original_standbys,
+ timeout=60
+ )
+
+ # The one we failed should come back as a standby (he isn't
+ # really dead)
+ self.wait_until_true(
+ lambda: original_active in self.mgr_cluster.get_standby_ids(),
+ timeout=10
+ )
+
+ def test_standby_timeout(self):
+ """
+ That when a standby daemon stops sending beacons, it is
+ removed from the list of standbys
+ :return:
+ """
+ original_active = self.mgr_cluster.get_active_id()
+ original_standbys = self.mgr_cluster.get_standby_ids()
+
+ victim = original_standbys[0]
+ self.mgr_cluster.mgr_stop(victim)
+
+ expect_standbys = set(original_standbys) - {victim}
+
+ self.wait_until_true(
+ lambda: set(self.mgr_cluster.get_standby_ids()) == expect_standbys,
+ timeout=60
+ )
+ self.assertEqual(self.mgr_cluster.get_active_id(), original_active)