From: John Spray
Date: Sat, 16 Jul 2016 21:16:53 +0000 (+0100)
Subject: tasks: create ceph-mgr tests
X-Git-Tag: v11.1.1~58^2^2~62^2~4
X-Git-Url: http://git.apps.os.sepia.ceph.com/?a=commitdiff_plain;h=38c23c18413bbc9c80bd01b8d458519a04d70aff;p=ceph-ci.git

tasks: create ceph-mgr tests

Signed-off-by: John Spray
---

diff --git a/tasks/mgr/__init__.py b/tasks/mgr/__init__.py
new file mode 100644
index 00000000000..e69de29bb2d
diff --git a/tasks/mgr/mgr_test_case.py b/tasks/mgr/mgr_test_case.py
new file mode 100644
index 00000000000..444a14e55cc
--- /dev/null
+++ b/tasks/mgr/mgr_test_case.py
@@ -0,0 +1,85 @@
+
+from unittest import case
+import json
+
+from teuthology import misc
+from tasks.ceph_test_case import CephTestCase
+
+# TODO move definition of CephCluster
+from tasks.cephfs.filesystem import CephCluster
+
+
+class MgrCluster(CephCluster):
+    def __init__(self, ctx):
+        super(MgrCluster, self).__init__(ctx)
+        self.mgr_ids = list(misc.all_roles_of_type(ctx.cluster, 'mgr'))
+
+        if len(self.mgr_ids) == 0:
+            raise RuntimeError(
+                "This task requires at least one manager daemon")
+
+        self.mgr_daemons = dict(
+            [(mgr_id, self._ctx.daemons.get_daemon('mgr', mgr_id)) for mgr_id
+             in self.mgr_ids])
+
+    @property
+    def admin_remote(self):
+        first_mon = misc.get_first_mon(self._ctx, None)
+        (result,) = self._ctx.cluster.only(first_mon).remotes.iterkeys()
+        return result
+
+    def mgr_stop(self, mgr_id):
+        self.mgr_daemons[mgr_id].stop()
+
+    def mgr_fail(self, mgr_id):
+        self.mon_manager.raw_cluster_cmd("mgr", "fail", mgr_id)
+
+    def mgr_restart(self, mgr_id):
+        self.mgr_daemons[mgr_id].restart()
+
+    def get_mgr_map(self):
+        status = json.loads(
+            self.mon_manager.raw_cluster_cmd("status", "--format=json-pretty"))
+
+        return status["mgrmap"]
+
+    def get_active_id(self):
+        return self.get_mgr_map()["active_name"]
+
+    def get_standby_ids(self):
+        return [s['name'] for s in self.get_mgr_map()["standbys"]]
+
+
+class MgrTestCase(CephTestCase):
+    REQUIRE_MGRS = 1
+
+    def setUp(self):
+        super(MgrTestCase, self).setUp()
+
+        # The test runner should have populated this
+        assert self.mgr_cluster is not None
+
+        if len(self.mgr_cluster.mgr_ids) < self.REQUIRE_MGRS:
+            raise case.SkipTest("Only have {0} manager daemons, "
+                                "{1} are required".format(
+                len(self.mgr_cluster.mgr_ids), self.REQUIRE_MGRS))
+
+        # Restart all the daemons
+        for daemon in self.mgr_cluster.mgr_daemons.values():
+            daemon.stop()
+
+        for mgr_id in self.mgr_cluster.mgr_ids:
+            self.mgr_cluster.mgr_fail(mgr_id)
+
+        for daemon in self.mgr_cluster.mgr_daemons.values():
+            daemon.restart()
+
+        # Wait for an active to come up
+        self.wait_until_true(lambda: self.mgr_cluster.get_active_id() != "",
+                             timeout=20)
+
+        expect_standbys = set(self.mgr_cluster.mgr_ids) \
+            - {self.mgr_cluster.get_active_id()}
+        self.wait_until_true(
+            lambda: set(self.mgr_cluster.get_standby_ids()) == expect_standbys,
+            timeout=20)
diff --git a/tasks/mgr/test_failover.py b/tasks/mgr/test_failover.py
new file mode 100644
index 00000000000..8994ad49bfe
--- /dev/null
+++ b/tasks/mgr/test_failover.py
@@ -0,0 +1,81 @@
+
+import logging
+
+from tasks.mgr.mgr_test_case import MgrTestCase
+
+
+log = logging.getLogger(__name__)
+
+
+class TestFailover(MgrTestCase):
+    REQUIRE_MGRS = 2
+
+    def test_timeout(self):
+        """
+        That when an active mgr stops responding, a standby is promoted
+        after mon_mgr_beacon_grace.
+        """
+
+        # Query which mgr is active
+        original_active = self.mgr_cluster.get_active_id()
+        original_standbys = self.mgr_cluster.get_standby_ids()
+
+        # Stop that daemon
+        self.mgr_cluster.mgr_stop(original_active)
+
+        # Assert that the other mgr becomes active
+        self.wait_until_true(
+            lambda: self.mgr_cluster.get_active_id() in original_standbys,
+            timeout=60
+        )
+
+        self.mgr_cluster.mgr_restart(original_active)
+        self.wait_until_true(
+            lambda: original_active in self.mgr_cluster.get_standby_ids(),
+            timeout=10
+        )
+
+    def test_explicit_fail(self):
+        """
+        That when a user explicitly fails a daemon, a standby immediately
+        replaces it.
+        :return:
+        """
+        # Query which mgr is active
+        original_active = self.mgr_cluster.get_active_id()
+        original_standbys = self.mgr_cluster.get_standby_ids()
+
+        self.mgr_cluster.mgr_fail(original_active)
+
+        # A standby should take over
+        self.wait_until_true(
+            lambda: self.mgr_cluster.get_active_id() in original_standbys,
+            timeout=60
+        )
+
+        # The one we failed should come back as a standby (it isn't
+        # really dead)
+        self.wait_until_true(
+            lambda: original_active in self.mgr_cluster.get_standby_ids(),
+            timeout=10
+        )
+
+    def test_standby_timeout(self):
+        """
+        That when a standby daemon stops sending beacons, it is
+        removed from the list of standbys.
+        :return:
+        """
+        original_active = self.mgr_cluster.get_active_id()
+        original_standbys = self.mgr_cluster.get_standby_ids()
+
+        victim = original_standbys[0]
+        self.mgr_cluster.mgr_stop(victim)
+
+        expect_standbys = set(original_standbys) - {victim}
+
+        self.wait_until_true(
+            lambda: set(self.mgr_cluster.get_standby_ids()) == expect_standbys,
+            timeout=60
+        )
+        self.assertEqual(self.mgr_cluster.get_active_id(), original_active)
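
For illustration, a further test module could be built on the same helpers.
The sketch below is hypothetical and not part of this change: the module path
(tasks/mgr/test_example.py), the class and the test are invented for the
example. It uses only the MgrCluster/MgrTestCase API introduced above
(REQUIRE_MGRS, mgr_ids, get_active_id, get_standby_ids) and assumes, as
setUp() does, that the test runner has populated self.mgr_cluster.

import logging

from tasks.mgr.mgr_test_case import MgrTestCase

log = logging.getLogger(__name__)


class TestActiveAssignment(MgrTestCase):
    # Hypothetical example: require two daemons so that both an active
    # and at least one standby exist after MgrTestCase.setUp() settles.
    REQUIRE_MGRS = 2

    def test_roles_partition(self):
        """
        That exactly one daemon is reported active and every other
        configured daemon is reported as a standby.
        """
        active = self.mgr_cluster.get_active_id()
        standbys = set(self.mgr_cluster.get_standby_ids())

        # The active daemon must not also appear in the standby list
        self.assertNotIn(active, standbys)

        # Active plus standbys should account for every configured mgr
        self.assertEqual({active} | standbys,
                         set(self.mgr_cluster.mgr_ids))

Like the tests in test_failover.py, this reads role assignments from the
mgrmap that MgrCluster.get_mgr_map() extracts from "ceph status", so it
exercises the mon's view of the daemons rather than the daemons directly.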