From 7eac74933ff565405f50a04a7c7be86b76d1bc8a Mon Sep 17 00:00:00 2001 From: Patrick Donnelly Date: Wed, 20 Oct 2021 21:43:54 -0400 Subject: [PATCH] qa: lengthen health warning wait It's just a little too short! Fixes: https://tracker.ceph.com/issues/52995 Signed-off-by: Patrick Donnelly (cherry picked from commit c8810e46e8f28ed204ef2d39f552f96f4a6cfe39) Conflicts: qa/tasks/cephfs/test_failover.py --- qa/tasks/ceph_test_case.py | 1 + qa/tasks/cephfs/filesystem.py | 11 +++++++++++ qa/tasks/cephfs/test_failover.py | 23 ++++++----------------- qa/tasks/cephfs/test_scrub_checks.py | 4 +--- qa/tasks/cephfs/test_snapshots.py | 18 +++++++----------- 5 files changed, 26 insertions(+), 31 deletions(-) diff --git a/qa/tasks/ceph_test_case.py b/qa/tasks/ceph_test_case.py index 0de395c0682a7..9234fc688ad8e 100644 --- a/qa/tasks/ceph_test_case.py +++ b/qa/tasks/ceph_test_case.py @@ -161,6 +161,7 @@ class CephTestCase(unittest.TestCase): log.debug("Not found expected summary strings yet ({0})".format(summary_strings)) return False + log.info(f"waiting {timeout}s for health warning matching {pattern}") self.wait_until_true(seen_health_warning, timeout) def wait_for_health_clear(self, timeout): diff --git a/qa/tasks/cephfs/filesystem.py b/qa/tasks/cephfs/filesystem.py index 35b80106dc8c5..e778d1524390b 100644 --- a/qa/tasks/cephfs/filesystem.py +++ b/qa/tasks/cephfs/filesystem.py @@ -473,6 +473,17 @@ class MDSCluster(CephCluster): for fs in self.status().get_filesystems(): Filesystem(ctx=self._ctx, fscid=fs['id']).destroy() + @property + def beacon_timeout(self): + """ + Generate an acceptable timeout for the mons to drive some MDSMap change + because of missed beacons from some MDS. This involves looking up the + grace period in use by the mons and adding an acceptable buffer. + """ + + grace = float(self.get_config("mds_beacon_grace", service_type="mon")) + return grace*2+15 + class Filesystem(MDSCluster): """ diff --git a/qa/tasks/cephfs/test_failover.py b/qa/tasks/cephfs/test_failover.py index 74548930976c1..3d7637112b8b3 100644 --- a/qa/tasks/cephfs/test_failover.py +++ b/qa/tasks/cephfs/test_failover.py @@ -319,8 +319,6 @@ class TestFailover(CephFSTestCase): # Kill the rank 0 daemon's physical process self.fs.mds_stop(original_active) - grace = float(self.fs.get_config("mds_beacon_grace", service_type="mon")) - # Wait until the monitor promotes his replacement def promoted(): active = self.fs.get_active_names() @@ -328,9 +326,7 @@ class TestFailover(CephFSTestCase): log.info("Waiting for promotion of one of the original standbys {0}".format( original_standbys)) - self.wait_until_true( - promoted, - timeout=grace*2) + self.wait_until_true(promoted, timeout=self.fs.beacon_timeout) # Start the original rank 0 daemon up again, see that he becomes a standby self.fs.mds_restart(original_active) @@ -352,8 +348,6 @@ class TestFailover(CephFSTestCase): if not require_active: self.skipTest("fuse_require_active_mds is not set") - grace = float(self.fs.get_config("mds_beacon_grace", service_type="mon")) - # Check it's not laggy to begin with (original_active, ) = self.fs.get_active_names() self.assertNotIn("laggy_since", self.fs.status().get_mds(original_active)) @@ -376,7 +370,7 @@ class TestFailover(CephFSTestCase): return True - self.wait_until_true(laggy, grace * 2) + self.wait_until_true(laggy, self.fs.beacon_timeout) with self.assertRaises(CommandFailedError): self.mounts[0].mount_wait() @@ -388,8 +382,6 @@ class TestFailover(CephFSTestCase): # Need all my standbys up as well as the active daemons self.wait_for_daemon_start() - grace = float(self.fs.get_config("mds_beacon_grace", service_type="mon")) - standbys = self.mds_cluster.get_standby_daemons() self.assertGreaterEqual(len(standbys), 1) self.fs.mon_manager.raw_cluster_cmd('fs', 'set', self.fs.name, 'standby_count_wanted', str(len(standbys))) @@ -397,8 +389,7 @@ class TestFailover(CephFSTestCase): # Kill a standby and check for warning victim = standbys.pop() self.fs.mds_stop(victim) - log.info("waiting for insufficient standby daemon warning") - self.wait_for_health("MDS_INSUFFICIENT_STANDBY", grace*2) + self.wait_for_health("MDS_INSUFFICIENT_STANDBY", self.fs.beacon_timeout) # restart the standby, see that he becomes a standby, check health clears self.fs.mds_restart(victim) @@ -412,8 +403,7 @@ class TestFailover(CephFSTestCase): standbys = self.mds_cluster.get_standby_daemons() self.assertGreaterEqual(len(standbys), 1) self.fs.mon_manager.raw_cluster_cmd('fs', 'set', self.fs.name, 'standby_count_wanted', str(len(standbys)+1)) - log.info("waiting for insufficient standby daemon warning") - self.wait_for_health("MDS_INSUFFICIENT_STANDBY", grace*2) + self.wait_for_health("MDS_INSUFFICIENT_STANDBY", self.fs.beacon_timeout) # Set it to 0 self.fs.mon_manager.raw_cluster_cmd('fs', 'set', self.fs.name, 'standby_count_wanted', '0') @@ -429,7 +419,6 @@ class TestFailover(CephFSTestCase): self.mount_a.umount_wait() - grace = float(self.fs.get_config("mds_beacon_grace", service_type="mon")) monc_timeout = float(self.fs.get_config("mon_client_ping_timeout", service_type="mds")) mds_0 = self.fs.get_rank(rank=0, status=status) @@ -437,7 +426,7 @@ class TestFailover(CephFSTestCase): self.fs.rank_signal(signal.SIGSTOP, rank=0, status=status) self.wait_until_true( lambda: "laggy_since" in self.fs.get_rank(), - timeout=grace * 2 + timeout=self.fs.beacon_timeout ) self.fs.rank_fail(rank=1) @@ -450,7 +439,7 @@ class TestFailover(CephFSTestCase): self.fs.rank_signal(signal.SIGCONT, rank=0) self.wait_until_true( lambda: "laggy_since" not in self.fs.get_rank(rank=0), - timeout=grace * 2 + timeout=self.fs.beacon_timeout ) # mds.b will be stuck at 'reconnect' state if snapserver gets confused diff --git a/qa/tasks/cephfs/test_scrub_checks.py b/qa/tasks/cephfs/test_scrub_checks.py index cdb33a4bba9ce..0e84f7ed2645c 100644 --- a/qa/tasks/cephfs/test_scrub_checks.py +++ b/qa/tasks/cephfs/test_scrub_checks.py @@ -167,15 +167,13 @@ done # Kill the rank 0 self.fs.mds_stop(original_active) - grace = float(self.fs.get_config("mds_beacon_grace", service_type="mon")) - def promoted(): active = self.fs.get_active_names() return active and active[0] in original_standbys log.info("Waiting for promotion of one of the original standbys {0}".format( original_standbys)) - self.wait_until_true(promoted, timeout=grace*2) + self.wait_until_true(promoted, timeout=self.fs.beacon_timeout) self._check_task_status_na() diff --git a/qa/tasks/cephfs/test_snapshots.py b/qa/tasks/cephfs/test_snapshots.py index 52a5850d35252..4462d8d7b9ac6 100644 --- a/qa/tasks/cephfs/test_snapshots.py +++ b/qa/tasks/cephfs/test_snapshots.py @@ -69,8 +69,6 @@ class TestSnapshots(CephFSTestCase): self.fs.set_max_mds(2) status = self.fs.wait_for_daemons() - grace = float(self.fs.get_config("mds_beacon_grace", service_type="mon")) - # setup subtrees self.mount_a.run_shell(["mkdir", "-p", "d1/dir"]) self.mount_a.setfattr("d1", "ceph.dir.pin", "1") @@ -91,7 +89,7 @@ class TestSnapshots(CephFSTestCase): self.fs.rank_freeze(True, rank=0) self.fs.rank_asok(['config', 'set', "mds_kill_mdstable_at", "{0}".format(i)], rank=0, status=status) proc = self.mount_a.run_shell(["mkdir", "d1/dir/.snap/s1{0}".format(i)], wait=False) - self.wait_until_true(lambda: "laggy_since" in self.fs.get_rank(rank=0), timeout=grace*2); + self.wait_until_true(lambda: "laggy_since" in self.fs.get_rank(rank=0), timeout=self.fs.beacon_timeout); self.delete_mds_coredump(rank0['name']); self.fs.rank_fail(rank=0) @@ -119,7 +117,7 @@ class TestSnapshots(CephFSTestCase): self.fs.rank_freeze(True, rank=1) # prevent failover... self.fs.rank_asok(['config', 'set', "mds_kill_mdstable_at", "{0}".format(i)], rank=0, status=status) proc = self.mount_a.run_shell(["mkdir", "d1/dir/.snap/s2{0}".format(i)], wait=False) - self.wait_until_true(lambda: "laggy_since" in self.fs.get_rank(rank=0), timeout=grace*3); + self.wait_until_true(lambda: "laggy_since" in self.fs.get_rank(rank=0), timeout=self.fs.beacon_timeout); self.delete_mds_coredump(rank0['name']); self.fs.rank_signal(signal.SIGKILL, rank=1) @@ -167,7 +165,7 @@ class TestSnapshots(CephFSTestCase): self.fs.rank_freeze(True, rank=1) # prevent failover... self.fs.rank_asok(['config', 'set', "mds_kill_mdstable_at", "{0}".format(i)], rank=1, status=status) proc = self.mount_a.run_shell(["mkdir", "d1/dir/.snap/s3{0}".format(i)], wait=False) - self.wait_until_true(lambda: "laggy_since" in self.fs.get_rank(rank=1), timeout=grace*2); + self.wait_until_true(lambda: "laggy_since" in self.fs.get_rank(rank=1), timeout=self.fs.beacon_timeout); self.delete_mds_coredump(rank1['name']); self.mount_a.kill() @@ -209,7 +207,7 @@ class TestSnapshots(CephFSTestCase): self.fs.rank_asok(['config', 'set', "mds_kill_mdstable_at", "8"], rank=0, status=status) self.fs.rank_asok(['config', 'set', "mds_kill_mdstable_at", "3"], rank=1, status=status) proc = self.mount_a.run_shell(["mkdir", "d1/dir/.snap/s4"], wait=False) - self.wait_until_true(lambda: "laggy_since" in self.fs.get_rank(rank=1), timeout=grace*2); + self.wait_until_true(lambda: "laggy_since" in self.fs.get_rank(rank=1), timeout=self.fs.beacon_timeout); self.delete_mds_coredump(rank1['name']); self.mount_a.kill() @@ -222,7 +220,7 @@ class TestSnapshots(CephFSTestCase): self.wait_for_daemon_start([rank1['name']]) # rollback triggers assertion - self.wait_until_true(lambda: "laggy_since" in self.fs.get_rank(rank=0), timeout=grace*2); + self.wait_until_true(lambda: "laggy_since" in self.fs.get_rank(rank=0), timeout=self.fs.beacon_timeout); self.delete_mds_coredump(rank0['name']); self.fs.rank_fail(rank=0) self.fs.mds_restart(rank0['name']) @@ -243,8 +241,6 @@ class TestSnapshots(CephFSTestCase): self.fs.set_max_mds(3) status = self.fs.wait_for_daemons() - grace = float(self.fs.get_config("mds_beacon_grace", service_type="mon")) - self.mount_a.run_shell(["mkdir", "-p", "d0/d1/dir"]) self.mount_a.run_shell(["mkdir", "-p", "d0/d2/dir"]) self.mount_a.setfattr("d0", "ceph.dir.pin", "0") @@ -301,7 +297,7 @@ class TestSnapshots(CephFSTestCase): self.fs.rank_freeze(True, rank=2) self.fs.rank_asok(['config', 'set', "mds_kill_mdstable_at", "9"], rank=2, status=status) proc = self.mount_a.run_shell(["mkdir", "d0/d1/dir/.snap/s3"], wait=False) - self.wait_until_true(lambda: "laggy_since" in self.fs.get_rank(rank=2), timeout=grace*2); + self.wait_until_true(lambda: "laggy_since" in self.fs.get_rank(rank=2), timeout=self.fs.beacon_timeout); self.delete_mds_coredump(rank2['name']); # mksnap should wait for notify ack from mds.2 @@ -327,7 +323,7 @@ class TestSnapshots(CephFSTestCase): self.fs.rank_asok(['config', 'set', "mds_kill_mdstable_at", "4"], rank=2, status=status) last_created = self._get_last_created_snap(rank=0) proc = self.mount_a.run_shell(["mkdir", "d0/d2/dir/.snap/s{0}".format(i)], wait=False) - self.wait_until_true(lambda: "laggy_since" in self.fs.get_rank(rank=2), timeout=grace*2); + self.wait_until_true(lambda: "laggy_since" in self.fs.get_rank(rank=2), timeout=self.fs.beacon_timeout); self.delete_mds_coredump(rank2['name']); self.mount_a.kill() -- 2.39.5