qa: lengthen health warning wait (50760/head)
author     Patrick Donnelly <pdonnell@redhat.com>
           Thu, 21 Oct 2021 01:43:54 +0000 (21:43 -0400)
committer  Patrick Donnelly <pdonnell@redhat.com>
           Thu, 30 Mar 2023 15:45:47 +0000 (11:45 -0400)
It's just a little too short!

Fixes: https://tracker.ceph.com/issues/52995
Signed-off-by: Patrick Donnelly <pdonnell@redhat.com>
(cherry picked from commit c8810e46e8f28ed204ef2d39f552f96f4a6cfe39)

Conflicts:
	qa/tasks/cephfs/test_failover.py

Files changed:
qa/tasks/ceph_test_case.py
qa/tasks/cephfs/filesystem.py
qa/tasks/cephfs/test_failover.py
qa/tasks/cephfs/test_scrub_checks.py
qa/tasks/cephfs/test_snapshots.py

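The fix replaces the ad-hoc grace*2 waits scattered through the tests with a single beacon_timeout property on MDSCluster, which adds a fixed buffer on top of the doubled grace period. A minimal sketch of the arithmetic, assuming the default mds_beacon_grace of 15 seconds (the numbers are illustrative, not part of the commit):

    # fs: a Filesystem/MDSCluster handle from the test harness
    # old pattern, recomputed in every test:
    grace = float(fs.get_config("mds_beacon_grace", service_type="mon"))
    old_timeout = grace * 2        # 30s with the 15s default -- "just a little too short"

    # new pattern, centralized below in MDSCluster.beacon_timeout:
    new_timeout = grace * 2 + 15   # 45s with the 15s default
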
qa/tasks/ceph_test_case.py
index 0de395c0682a7cfec0e794a3847aabcb4a39ca48..9234fc688ad8e9313fe8b9725b885b122bb76229 100644
@@ -161,6 +161,7 @@ class CephTestCase(unittest.TestCase):
             log.debug("Not found expected summary strings yet ({0})".format(summary_strings))
             return False
 
+        log.info(f"waiting {timeout}s for health warning matching {pattern}")
         self.wait_until_true(seen_health_warning, timeout)
 
     def wait_for_health_clear(self, timeout):
qa/tasks/cephfs/filesystem.py
index 35b80106dc8c5ab1976078cdfc82f0a3e7285f80..e778d1524390b073e9dddc98f732910ce45784fa 100644
@@ -473,6 +473,17 @@ class MDSCluster(CephCluster):
         for fs in self.status().get_filesystems():
             Filesystem(ctx=self._ctx, fscid=fs['id']).destroy()
 
+    @property
+    def beacon_timeout(self):
+        """
+        Generate an acceptable timeout for the mons to drive some MDSMap change
+        because of missed beacons from some MDS. This involves looking up the
+        grace period in use by the mons and adding an acceptable buffer.
+        """
+
+        grace = float(self.get_config("mds_beacon_grace", service_type="mon"))
+        return grace*2+15
+
 
 class Filesystem(MDSCluster):
     """
qa/tasks/cephfs/test_failover.py
index 74548930976c1a99f72ca275fa4c9fb998aed4fe..3d7637112b8b3c810761ebfcd72113c11759dd9b 100644
@@ -319,8 +319,6 @@ class TestFailover(CephFSTestCase):
         # Kill the rank 0 daemon's physical process
         self.fs.mds_stop(original_active)
 
-        grace = float(self.fs.get_config("mds_beacon_grace", service_type="mon"))
-
         # Wait until the monitor promotes his replacement
         def promoted():
             active = self.fs.get_active_names()
@@ -328,9 +326,7 @@ class TestFailover(CephFSTestCase):
 
         log.info("Waiting for promotion of one of the original standbys {0}".format(
             original_standbys))
-        self.wait_until_true(
-            promoted,
-            timeout=grace*2)
+        self.wait_until_true(promoted, timeout=self.fs.beacon_timeout)
 
         # Start the original rank 0 daemon up again, see that he becomes a standby
         self.fs.mds_restart(original_active)
@@ -352,8 +348,6 @@ class TestFailover(CephFSTestCase):
         if not require_active:
             self.skipTest("fuse_require_active_mds is not set")
 
-        grace = float(self.fs.get_config("mds_beacon_grace", service_type="mon"))
-
         # Check it's not laggy to begin with
         (original_active, ) = self.fs.get_active_names()
         self.assertNotIn("laggy_since", self.fs.status().get_mds(original_active))
@@ -376,7 +370,7 @@ class TestFailover(CephFSTestCase):
 
             return True
 
-        self.wait_until_true(laggy, grace * 2)
+        self.wait_until_true(laggy, self.fs.beacon_timeout)
         with self.assertRaises(CommandFailedError):
             self.mounts[0].mount_wait()
 
@@ -388,8 +382,6 @@ class TestFailover(CephFSTestCase):
         # Need all my standbys up as well as the active daemons
         self.wait_for_daemon_start()
 
-        grace = float(self.fs.get_config("mds_beacon_grace", service_type="mon"))
-
         standbys = self.mds_cluster.get_standby_daemons()
         self.assertGreaterEqual(len(standbys), 1)
         self.fs.mon_manager.raw_cluster_cmd('fs', 'set', self.fs.name, 'standby_count_wanted', str(len(standbys)))
@@ -397,8 +389,7 @@ class TestFailover(CephFSTestCase):
         # Kill a standby and check for warning
         victim = standbys.pop()
         self.fs.mds_stop(victim)
-        log.info("waiting for insufficient standby daemon warning")
-        self.wait_for_health("MDS_INSUFFICIENT_STANDBY", grace*2)
+        self.wait_for_health("MDS_INSUFFICIENT_STANDBY", self.fs.beacon_timeout)
 
         # restart the standby, see that he becomes a standby, check health clears
         self.fs.mds_restart(victim)
@@ -412,8 +403,7 @@ class TestFailover(CephFSTestCase):
         standbys = self.mds_cluster.get_standby_daemons()
         self.assertGreaterEqual(len(standbys), 1)
         self.fs.mon_manager.raw_cluster_cmd('fs', 'set', self.fs.name, 'standby_count_wanted', str(len(standbys)+1))
-        log.info("waiting for insufficient standby daemon warning")
-        self.wait_for_health("MDS_INSUFFICIENT_STANDBY", grace*2)
+        self.wait_for_health("MDS_INSUFFICIENT_STANDBY", self.fs.beacon_timeout)
 
         # Set it to 0
         self.fs.mon_manager.raw_cluster_cmd('fs', 'set', self.fs.name, 'standby_count_wanted', '0')
@@ -429,7 +419,6 @@ class TestFailover(CephFSTestCase):
 
         self.mount_a.umount_wait()
 
-        grace = float(self.fs.get_config("mds_beacon_grace", service_type="mon"))
         monc_timeout = float(self.fs.get_config("mon_client_ping_timeout", service_type="mds"))
 
         mds_0 = self.fs.get_rank(rank=0, status=status)
@@ -437,7 +426,7 @@ class TestFailover(CephFSTestCase):
         self.fs.rank_signal(signal.SIGSTOP, rank=0, status=status)
         self.wait_until_true(
             lambda: "laggy_since" in self.fs.get_rank(),
-            timeout=grace * 2
+            timeout=self.fs.beacon_timeout
         )
 
         self.fs.rank_fail(rank=1)
@@ -450,7 +439,7 @@ class TestFailover(CephFSTestCase):
         self.fs.rank_signal(signal.SIGCONT, rank=0)
         self.wait_until_true(
             lambda: "laggy_since" not in self.fs.get_rank(rank=0),
-            timeout=grace * 2
+            timeout=self.fs.beacon_timeout
         )
 
         # mds.b will be stuck at 'reconnect' state if snapserver gets confused
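
The hunks above, like those in the two files that follow, all make the same one-line substitution: each wait_until_true(..., timeout=grace*2) (plus one grace*3 in test_snapshots.py) becomes timeout=self.fs.beacon_timeout. The recurring shape, condensed for reference:

    # poll until the mons mark the rank laggy once its beacons stop
    self.wait_until_true(
        lambda: "laggy_since" in self.fs.get_rank(rank=0),
        timeout=self.fs.beacon_timeout)
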
qa/tasks/cephfs/test_scrub_checks.py
index cdb33a4bba9cea0ebc9e85412388cbbc4aaac484..0e84f7ed2645cfdacc8b89f05359da68a1497595 100644
@@ -167,15 +167,13 @@ done
         # Kill the rank 0
         self.fs.mds_stop(original_active)
 
-        grace = float(self.fs.get_config("mds_beacon_grace", service_type="mon"))
-
         def promoted():
             active = self.fs.get_active_names()
             return active and active[0] in original_standbys
 
         log.info("Waiting for promotion of one of the original standbys {0}".format(
             original_standbys))
-        self.wait_until_true(promoted, timeout=grace*2)
+        self.wait_until_true(promoted, timeout=self.fs.beacon_timeout)
 
         self._check_task_status_na()
 
qa/tasks/cephfs/test_snapshots.py
index 52a5850d35252e8ca1f61a4175b976b3469660e0..4462d8d7b9ac64f8f9d106484c41133624b8dc3f 100644
@@ -69,8 +69,6 @@ class TestSnapshots(CephFSTestCase):
         self.fs.set_max_mds(2)
         status = self.fs.wait_for_daemons()
 
-        grace = float(self.fs.get_config("mds_beacon_grace", service_type="mon"))
-
         # setup subtrees
         self.mount_a.run_shell(["mkdir", "-p", "d1/dir"])
         self.mount_a.setfattr("d1", "ceph.dir.pin", "1")
@@ -91,7 +89,7 @@ class TestSnapshots(CephFSTestCase):
             self.fs.rank_freeze(True, rank=0)
             self.fs.rank_asok(['config', 'set', "mds_kill_mdstable_at", "{0}".format(i)], rank=0, status=status)
             proc = self.mount_a.run_shell(["mkdir", "d1/dir/.snap/s1{0}".format(i)], wait=False)
-            self.wait_until_true(lambda: "laggy_since" in self.fs.get_rank(rank=0), timeout=grace*2);
+            self.wait_until_true(lambda: "laggy_since" in self.fs.get_rank(rank=0), timeout=self.fs.beacon_timeout);
             self.delete_mds_coredump(rank0['name']);
 
             self.fs.rank_fail(rank=0)
@@ -119,7 +117,7 @@ class TestSnapshots(CephFSTestCase):
             self.fs.rank_freeze(True, rank=1) # prevent failover...
             self.fs.rank_asok(['config', 'set', "mds_kill_mdstable_at", "{0}".format(i)], rank=0, status=status)
             proc = self.mount_a.run_shell(["mkdir", "d1/dir/.snap/s2{0}".format(i)], wait=False)
-            self.wait_until_true(lambda: "laggy_since" in self.fs.get_rank(rank=0), timeout=grace*3);
+            self.wait_until_true(lambda: "laggy_since" in self.fs.get_rank(rank=0), timeout=self.fs.beacon_timeout);
             self.delete_mds_coredump(rank0['name']);
 
             self.fs.rank_signal(signal.SIGKILL, rank=1)
@@ -167,7 +165,7 @@ class TestSnapshots(CephFSTestCase):
             self.fs.rank_freeze(True, rank=1) # prevent failover...
             self.fs.rank_asok(['config', 'set', "mds_kill_mdstable_at", "{0}".format(i)], rank=1, status=status)
             proc = self.mount_a.run_shell(["mkdir", "d1/dir/.snap/s3{0}".format(i)], wait=False)
-            self.wait_until_true(lambda: "laggy_since" in self.fs.get_rank(rank=1), timeout=grace*2);
+            self.wait_until_true(lambda: "laggy_since" in self.fs.get_rank(rank=1), timeout=self.fs.beacon_timeout);
             self.delete_mds_coredump(rank1['name']);
 
             self.mount_a.kill()
@@ -209,7 +207,7 @@ class TestSnapshots(CephFSTestCase):
         self.fs.rank_asok(['config', 'set', "mds_kill_mdstable_at", "8"], rank=0, status=status)
         self.fs.rank_asok(['config', 'set', "mds_kill_mdstable_at", "3"], rank=1, status=status)
         proc = self.mount_a.run_shell(["mkdir", "d1/dir/.snap/s4"], wait=False)
-        self.wait_until_true(lambda: "laggy_since" in self.fs.get_rank(rank=1), timeout=grace*2);
+        self.wait_until_true(lambda: "laggy_since" in self.fs.get_rank(rank=1), timeout=self.fs.beacon_timeout);
         self.delete_mds_coredump(rank1['name']);
 
         self.mount_a.kill()
@@ -222,7 +220,7 @@ class TestSnapshots(CephFSTestCase):
         self.wait_for_daemon_start([rank1['name']])
 
         # rollback triggers assertion
-        self.wait_until_true(lambda: "laggy_since" in self.fs.get_rank(rank=0), timeout=grace*2);
+        self.wait_until_true(lambda: "laggy_since" in self.fs.get_rank(rank=0), timeout=self.fs.beacon_timeout);
         self.delete_mds_coredump(rank0['name']);
         self.fs.rank_fail(rank=0)
         self.fs.mds_restart(rank0['name'])
@@ -243,8 +241,6 @@ class TestSnapshots(CephFSTestCase):
         self.fs.set_max_mds(3)
         status = self.fs.wait_for_daemons()
 
-        grace = float(self.fs.get_config("mds_beacon_grace", service_type="mon"))
-
         self.mount_a.run_shell(["mkdir", "-p", "d0/d1/dir"])
         self.mount_a.run_shell(["mkdir", "-p", "d0/d2/dir"])
         self.mount_a.setfattr("d0", "ceph.dir.pin", "0")
@@ -301,7 +297,7 @@ class TestSnapshots(CephFSTestCase):
         self.fs.rank_freeze(True, rank=2)
         self.fs.rank_asok(['config', 'set', "mds_kill_mdstable_at", "9"], rank=2, status=status)
         proc = self.mount_a.run_shell(["mkdir", "d0/d1/dir/.snap/s3"], wait=False)
-        self.wait_until_true(lambda: "laggy_since" in self.fs.get_rank(rank=2), timeout=grace*2);
+        self.wait_until_true(lambda: "laggy_since" in self.fs.get_rank(rank=2), timeout=self.fs.beacon_timeout);
         self.delete_mds_coredump(rank2['name']);
 
         # mksnap should wait for notify ack from mds.2
@@ -327,7 +323,7 @@ class TestSnapshots(CephFSTestCase):
             self.fs.rank_asok(['config', 'set', "mds_kill_mdstable_at", "4"], rank=2, status=status)
             last_created = self._get_last_created_snap(rank=0)
             proc = self.mount_a.run_shell(["mkdir", "d0/d2/dir/.snap/s{0}".format(i)], wait=False)
-            self.wait_until_true(lambda: "laggy_since" in self.fs.get_rank(rank=2), timeout=grace*2);
+            self.wait_until_true(lambda: "laggy_since" in self.fs.get_rank(rank=2), timeout=self.fs.beacon_timeout);
             self.delete_mds_coredump(rank2['name']);
 
             self.mount_a.kill()