git.apps.os.sepia.ceph.com Git - ceph.git/commitdiff
qa: lengthen health warning wait 43613/head
author Patrick Donnelly <pdonnell@redhat.com>
Thu, 21 Oct 2021 01:43:54 +0000 (21:43 -0400)
committer Patrick Donnelly <pdonnell@redhat.com>
Tue, 26 Oct 2021 18:26:19 +0000 (14:26 -0400)
The health warning wait is just a little too short!

Fixes: https://tracker.ceph.com/issues/52995
Signed-off-by: Patrick Donnelly <pdonnell@redhat.com>
qa/tasks/ceph_test_case.py
qa/tasks/cephfs/filesystem.py
qa/tasks/cephfs/test_failover.py
qa/tasks/cephfs/test_scrub_checks.py
qa/tasks/cephfs/test_snapshots.py

index 0ce9e81f1c138bef997bf77ae8862d423b4384ac..8f5b27e28a8184d419c56f46bef89ce1168b0fdd 100644 (file)
@@ -156,6 +156,7 @@ class CephTestCase(unittest.TestCase):
             log.debug("Not found expected summary strings yet ({0})".format(summary_strings))
             return False
 
+        log.info(f"waiting {timeout}s for health warning matching {pattern}")
         self.wait_until_true(seen_health_warning, timeout)
 
     def wait_for_health_clear(self, timeout):
index 0f846cf1e0e632834b3c5b479fd5066365c8d2df..9e079c9c80c0dcef8a94d768129f9075718c8209 100644 (file)
@@ -478,6 +478,17 @@ class MDSCluster(CephCluster):
         for fs in self.status().get_filesystems():
             Filesystem(ctx=self._ctx, fscid=fs['id']).destroy()
 
+    @property
+    def beacon_timeout(self):
+        """
+        Generate an acceptable timeout for the mons to drive some MDSMap change
+        because of missed beacons from some MDS. This involves looking up the
+        grace period in use by the mons and adding an acceptable buffer.
+        """
+
+        grace = float(self.get_config("mds_beacon_grace", service_type="mon"))
+        return grace*2+15
+
 
 class Filesystem(MDSCluster):
     """
index 7791e090e0c6116fe0b9df0d5745605844d844e4..7558ede168483ec197bcfb34d25f9a072c608b9f 100644 (file)
@@ -306,8 +306,6 @@ class TestFailover(CephFSTestCase):
         in thrashing tests.
         """
 
-        grace = float(self.fs.get_config("mds_beacon_grace", service_type="mon"))
-
         (original_active, ) = self.fs.get_active_names()
         original_standbys = self.mds_cluster.get_standby_daemons()
 
@@ -321,7 +319,7 @@ class TestFailover(CephFSTestCase):
 
         log.info("Waiting for promotion of one of the original standbys {0}".format(
             original_standbys))
-        self.wait_until_true(promoted, timeout=grace*2)
+        self.wait_until_true(promoted, timeout=self.fs.beacon_timeout)
 
         # Start the original rank 0 daemon up again, see that he becomes a standby
         self.fs.mds_restart(original_active)
@@ -343,8 +341,6 @@ class TestFailover(CephFSTestCase):
         if not require_active:
             self.skipTest("fuse_require_active_mds is not set")
 
-        grace = float(self.fs.get_config("mds_beacon_grace", service_type="mon"))
-
         # Check it's not laggy to begin with
         (original_active, ) = self.fs.get_active_names()
         self.assertNotIn("laggy_since", self.fs.status().get_mds(original_active))
@@ -367,7 +363,7 @@ class TestFailover(CephFSTestCase):
 
             return True
 
-        self.wait_until_true(laggy, grace * 2)
+        self.wait_until_true(laggy, self.fs.beacon_timeout)
         with self.assertRaises(CommandFailedError):
             self.mounts[0].mount_wait()
 
@@ -379,8 +375,6 @@ class TestFailover(CephFSTestCase):
         # Need all my standbys up as well as the active daemons
         self.wait_for_daemon_start()
 
-        grace = float(self.fs.get_config("mds_beacon_grace", service_type="mon"))
-
         standbys = self.mds_cluster.get_standby_daemons()
         self.assertGreaterEqual(len(standbys), 1)
         self.fs.mon_manager.raw_cluster_cmd('fs', 'set', self.fs.name, 'standby_count_wanted', str(len(standbys)))
@@ -388,8 +382,7 @@ class TestFailover(CephFSTestCase):
         # Kill a standby and check for warning
         victim = standbys.pop()
         self.fs.mds_stop(victim)
-        log.info("waiting for insufficient standby daemon warning")
-        self.wait_for_health("MDS_INSUFFICIENT_STANDBY", grace*2)
+        self.wait_for_health("MDS_INSUFFICIENT_STANDBY", self.fs.beacon_timeout)
 
         # restart the standby, see that he becomes a standby, check health clears
         self.fs.mds_restart(victim)
@@ -403,8 +396,7 @@ class TestFailover(CephFSTestCase):
         standbys = self.mds_cluster.get_standby_daemons()
         self.assertGreaterEqual(len(standbys), 1)
         self.fs.mon_manager.raw_cluster_cmd('fs', 'set', self.fs.name, 'standby_count_wanted', str(len(standbys)+1))
-        log.info("waiting for insufficient standby daemon warning")
-        self.wait_for_health("MDS_INSUFFICIENT_STANDBY", grace*2)
+        self.wait_for_health("MDS_INSUFFICIENT_STANDBY", self.fs.beacon_timeout)
 
         # Set it to 0
         self.fs.mon_manager.raw_cluster_cmd('fs', 'set', self.fs.name, 'standby_count_wanted', '0')
@@ -420,7 +412,6 @@ class TestFailover(CephFSTestCase):
 
         self.mount_a.umount_wait()
 
-        grace = float(self.fs.get_config("mds_beacon_grace", service_type="mon"))
         monc_timeout = float(self.fs.get_config("mon_client_ping_timeout", service_type="mds"))
 
         mds_0 = self.fs.get_rank(rank=0, status=status)
@@ -428,7 +419,7 @@ class TestFailover(CephFSTestCase):
         self.fs.rank_signal(signal.SIGSTOP, rank=0, status=status)
         self.wait_until_true(
             lambda: "laggy_since" in self.fs.get_rank(),
-            timeout=grace * 2
+            timeout=self.fs.beacon_timeout
         )
 
         self.fs.rank_fail(rank=1)
@@ -441,7 +432,7 @@ class TestFailover(CephFSTestCase):
         self.fs.rank_signal(signal.SIGCONT, rank=0)
         self.wait_until_true(
             lambda: "laggy_since" not in self.fs.get_rank(rank=0),
-            timeout=grace * 2
+            timeout=self.fs.beacon_timeout
         )
 
         # mds.b will be stuck at 'reconnect' state if snapserver gets confused
index bcfc2fc9a3a1f368b9dc5de299c6c7db63e6a02e..ac86b24f0a57f16ac26859408a072c5a8c7e5d1a 100644 (file)
@@ -168,15 +168,13 @@ done
         # Kill the rank 0
         self.fs.mds_stop(original_active)
 
-        grace = float(self.fs.get_config("mds_beacon_grace", service_type="mon"))
-
         def promoted():
             active = self.fs.get_active_names()
             return active and active[0] in original_standbys
 
         log.info("Waiting for promotion of one of the original standbys {0}".format(
             original_standbys))
-        self.wait_until_true(promoted, timeout=grace*2)
+        self.wait_until_true(promoted, timeout=self.fs.beacon_timeout)
 
         self._check_task_status_na()
 
index 30d4e5801e0df6e084ddff9e025953dd56d9f577..bc3e6a16c8a5a760d05e71f5262525fc59f4469c 100644 (file)
@@ -70,8 +70,6 @@ class TestSnapshots(CephFSTestCase):
         self.fs.set_max_mds(2)
         status = self.fs.wait_for_daemons()
 
-        grace = float(self.fs.get_config("mds_beacon_grace", service_type="mon"))
-
         # setup subtrees
         self.mount_a.run_shell(["mkdir", "-p", "d1/dir"])
         self.mount_a.setfattr("d1", "ceph.dir.pin", "1")
@@ -92,7 +90,7 @@ class TestSnapshots(CephFSTestCase):
             self.fs.rank_freeze(True, rank=0)
             self.fs.rank_asok(['config', 'set', "mds_kill_mdstable_at", "{0}".format(i)], rank=0, status=status)
             proc = self.mount_a.run_shell(["mkdir", "d1/dir/.snap/s1{0}".format(i)], wait=False)
-            self.wait_until_true(lambda: "laggy_since" in self.fs.get_rank(rank=0), timeout=grace*2);
+            self.wait_until_true(lambda: "laggy_since" in self.fs.get_rank(rank=0), timeout=self.fs.beacon_timeout);
             self.delete_mds_coredump(rank0['name']);
 
             self.fs.rank_fail(rank=0)
@@ -120,7 +118,7 @@ class TestSnapshots(CephFSTestCase):
             self.fs.rank_freeze(True, rank=1) # prevent failover...
             self.fs.rank_asok(['config', 'set', "mds_kill_mdstable_at", "{0}".format(i)], rank=0, status=status)
             proc = self.mount_a.run_shell(["mkdir", "d1/dir/.snap/s2{0}".format(i)], wait=False)
-            self.wait_until_true(lambda: "laggy_since" in self.fs.get_rank(rank=0), timeout=grace*3);
+            self.wait_until_true(lambda: "laggy_since" in self.fs.get_rank(rank=0), timeout=self.fs.beacon_timeout);
             self.delete_mds_coredump(rank0['name']);
 
             self.fs.rank_signal(signal.SIGKILL, rank=1)
@@ -168,7 +166,7 @@ class TestSnapshots(CephFSTestCase):
             self.fs.rank_freeze(True, rank=1) # prevent failover...
             self.fs.rank_asok(['config', 'set', "mds_kill_mdstable_at", "{0}".format(i)], rank=1, status=status)
             proc = self.mount_a.run_shell(["mkdir", "d1/dir/.snap/s3{0}".format(i)], wait=False)
-            self.wait_until_true(lambda: "laggy_since" in self.fs.get_rank(rank=1), timeout=grace*2);
+            self.wait_until_true(lambda: "laggy_since" in self.fs.get_rank(rank=1), timeout=self.fs.beacon_timeout);
             self.delete_mds_coredump(rank1['name']);
 
             self.mount_a.kill()
@@ -210,7 +208,7 @@ class TestSnapshots(CephFSTestCase):
         self.fs.rank_asok(['config', 'set', "mds_kill_mdstable_at", "8"], rank=0, status=status)
         self.fs.rank_asok(['config', 'set', "mds_kill_mdstable_at", "3"], rank=1, status=status)
         proc = self.mount_a.run_shell(["mkdir", "d1/dir/.snap/s4"], wait=False)
-        self.wait_until_true(lambda: "laggy_since" in self.fs.get_rank(rank=1), timeout=grace*2);
+        self.wait_until_true(lambda: "laggy_since" in self.fs.get_rank(rank=1), timeout=self.fs.beacon_timeout);
         self.delete_mds_coredump(rank1['name']);
 
         self.mount_a.kill()
@@ -223,7 +221,7 @@ class TestSnapshots(CephFSTestCase):
         self.wait_for_daemon_start([rank1['name']])
 
         # rollback triggers assertion
-        self.wait_until_true(lambda: "laggy_since" in self.fs.get_rank(rank=0), timeout=grace*2);
+        self.wait_until_true(lambda: "laggy_since" in self.fs.get_rank(rank=0), timeout=self.fs.beacon_timeout);
         self.delete_mds_coredump(rank0['name']);
         self.fs.rank_fail(rank=0)
         self.fs.mds_restart(rank0['name'])
@@ -244,8 +242,6 @@ class TestSnapshots(CephFSTestCase):
         self.fs.set_max_mds(3)
         status = self.fs.wait_for_daemons()
 
-        grace = float(self.fs.get_config("mds_beacon_grace", service_type="mon"))
-
         self.mount_a.run_shell(["mkdir", "-p", "d0/d1/dir"])
         self.mount_a.run_shell(["mkdir", "-p", "d0/d2/dir"])
         self.mount_a.setfattr("d0", "ceph.dir.pin", "0")
@@ -302,7 +298,7 @@ class TestSnapshots(CephFSTestCase):
         self.fs.rank_freeze(True, rank=2)
         self.fs.rank_asok(['config', 'set', "mds_kill_mdstable_at", "9"], rank=2, status=status)
         proc = self.mount_a.run_shell(["mkdir", "d0/d1/dir/.snap/s3"], wait=False)
-        self.wait_until_true(lambda: "laggy_since" in self.fs.get_rank(rank=2), timeout=grace*2);
+        self.wait_until_true(lambda: "laggy_since" in self.fs.get_rank(rank=2), timeout=self.fs.beacon_timeout);
         self.delete_mds_coredump(rank2['name']);
 
         # mksnap should wait for notify ack from mds.2
@@ -328,7 +324,7 @@ class TestSnapshots(CephFSTestCase):
             self.fs.rank_asok(['config', 'set', "mds_kill_mdstable_at", "4"], rank=2, status=status)
             last_created = self._get_last_created_snap(rank=0)
             proc = self.mount_a.run_shell(["mkdir", "d0/d2/dir/.snap/s{0}".format(i)], wait=False)
-            self.wait_until_true(lambda: "laggy_since" in self.fs.get_rank(rank=2), timeout=grace*2);
+            self.wait_until_true(lambda: "laggy_since" in self.fs.get_rank(rank=2), timeout=self.fs.beacon_timeout);
             self.delete_mds_coredump(rank2['name']);
 
             self.mount_a.kill()