From 5e843162dadcbb4aef3f38250059c755693bdf06 Mon Sep 17 00:00:00 2001
From: "Yan, Zheng"
Date: Thu, 19 Jul 2018 12:36:31 +0800
Subject: [PATCH] qa/tasks/cephfs: add test for discontinuous mdsmap

Signed-off-by: "Yan, Zheng"
---
 qa/tasks/cephfs/filesystem.py    |  6 +++++
 qa/tasks/cephfs/test_failover.py | 46 ++++++++++++++++++++++++++++++++
 qa/tasks/vstart_runner.py        |  8 ++++++
 3 files changed, 60 insertions(+)

diff --git a/qa/tasks/cephfs/filesystem.py b/qa/tasks/cephfs/filesystem.py
index 4b3cffa9ad075..b2837c2eee82d 100644
--- a/qa/tasks/cephfs/filesystem.py
+++ b/qa/tasks/cephfs/filesystem.py
@@ -269,6 +269,12 @@ class MDSCluster(CephCluster):
 
         self._one_or_all(mds_id, _fail_restart)
 
+    def mds_signal(self, mds_id, sig, silent=False):
+        """
+        Signal an MDS daemon.
+        """
+        self.mds_daemons[mds_id].signal(sig, silent)
+
     def newfs(self, name='cephfs', create=True):
         return Filesystem(self._ctx, name=name, create=create)
 
diff --git a/qa/tasks/cephfs/test_failover.py b/qa/tasks/cephfs/test_failover.py
index 97f848ba1b970..48a4327058388 100644
--- a/qa/tasks/cephfs/test_failover.py
+++ b/qa/tasks/cephfs/test_failover.py
@@ -1,3 +1,5 @@
+import time
+import signal
 import json
 import logging
 from unittest import case, SkipTest
@@ -283,8 +285,52 @@ class TestFailover(CephFSTestCase):
 
         self.fs.mon_manager.raw_cluster_cmd('fs', 'set', self.fs.name, 'standby_count_wanted', '0')
         self.wait_for_health_clear(timeout=30)
 
+    def test_discontinuous_mdsmap(self):
+        """
+        That a discontinuous mdsmap does not affect failover.
+        See http://tracker.ceph.com/issues/24856.
+        """
+        mds_ids = sorted(self.mds_cluster.mds_ids)
+        mds_a, mds_b = mds_ids[0:2]
+        # Assign mds to fixed ranks, to prevent a standby from replacing the frozen mds
+        rank = 0
+        for mds_id in mds_ids:
+            self.set_conf("mds.{0}".format(mds_id), "mds_standby_for_rank", str(rank))
+            rank += 1
+        self.mds_cluster.mds_restart()
+        self.fs.wait_for_daemons()
+
+        self.fs.set_max_mds(2)
+        self.fs.wait_for_state('up:active', rank=1)
+
+        self.mount_a.umount_wait()
+
+        grace = float(self.fs.get_config("mds_beacon_grace", service_type="mon"))
+        monc_timeout = float(self.fs.get_config("mon_client_ping_timeout", service_type="mds"))
+        # Freeze mds_a
+        self.mds_cluster.mds_signal(mds_a, signal.SIGSTOP)
+        self.wait_until_true(
+            lambda: "laggy_since" in self.fs.status().get_mds(mds_a),
+            timeout=grace * 2
+        )
+
+        self.mds_cluster.mds_restart(mds_b)
+        self.fs.wait_for_state('up:resolve', rank=1, timeout=30)
+
+        # Make sure mds_a's monitor connection gets reset
+        time.sleep(monc_timeout * 2)
+
+        # Unfreeze mds_a; it will see a discontinuous mdsmap
+        self.mds_cluster.mds_signal(mds_a, signal.SIGCONT)
+        self.wait_until_true(
+            lambda: "laggy_since" not in self.fs.status().get_mds(mds_a),
+            timeout=grace * 2
+        )
+        # mds_b will get stuck in the 'reconnect' state if the snapserver gets
+        # confused by the discontinuous mdsmap
+        self.fs.wait_for_state('up:active', rank=1, timeout=30)
+
 class TestStandbyReplay(CephFSTestCase):
     MDSS_REQUIRED = 4
 
diff --git a/qa/tasks/vstart_runner.py b/qa/tasks/vstart_runner.py
index 87e45e0f840bc..0462514b1bbe7 100644
--- a/qa/tasks/vstart_runner.py
+++ b/qa/tasks/vstart_runner.py
@@ -373,6 +373,14 @@ class LocalDaemon(object):
         self.proc = self.controller.run([os.path.join(BIN_PREFIX, "./ceph-{0}".format(self.daemon_type)),
                                          "-i", self.daemon_id])
 
+    def signal(self, sig, silent=False):
+        if not self.running():
+            raise RuntimeError("Can't send signal to non-running daemon")
+
+        os.kill(self._get_pid(), sig)
+        if not silent:
+            log.info("Sent signal {0} to {1}.{2}".format(sig, self.daemon_type, self.daemon_id))
+
 def safe_kill(pid):
     """
-- 
2.39.5