qa: simplify tests which stop MDS ranks
author     Patrick Donnelly <pdonnell@redhat.com>
           Thu, 11 Mar 2021 19:06:23 +0000 (11:06 -0800)
committer  Patrick Donnelly <pdonnell@redhat.com>
           Wed, 31 Mar 2021 14:29:54 +0000 (07:29 -0700)
Instead of stopping MDS daemons and then individually failing them, just
fail the ranks or the entire file system where possible.

Signed-off-by: Patrick Donnelly <pdonnell@redhat.com>
(cherry picked from commit 0825d6aa9e1997d21b8a32f36c6c0544bf3ff40b)
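
For context, the before/after pattern this change applies across the qa suites
looks roughly like the sketch below. The helper names (rank_fail, fail,
set_joinable, wait_for_daemons) come from the hunks that follow; the wrapper
test class and the CLI mappings in the comments are illustrative assumptions,
not part of the commit.

    # Hedged sketch: driving the rank- and filesystem-level helpers from a
    # CephFS qa test, assuming the CephFSTestCase/Filesystem API used below.
    from tasks.cephfs.cephfs_test_case import CephFSTestCase

    class TestFailoverSketch(CephFSTestCase):
        MDSS_REQUIRED = 2

        def test_restart_single_rank(self):
            # Old pattern removed by this commit:
            #   self.fs.mds_stop()
            #   self.fs.mds_fail_restart()
            # New pattern: fail rank 0 and wait for an MDS to take it again.
            self.fs.rank_fail()
            self.fs.wait_for_daemons()

        def test_offline_repair(self):
            # Old pattern removed by this commit:
            #   self.fs.mds_stop()
            #   self.fs.mds_fail()
            #   ... offline work on the pools ...
            #   self.fs.mds_restart()
            # New pattern: take the whole file system offline, do the offline
            # work, then let MDS daemons join the ranks again.
            self.fs.fail()            # roughly `ceph fs fail <name>`
            # ... mutate RADOS objects, run cephfs-data-scan, etc. ...
            self.fs.set_joinable()    # roughly `ceph fs set <name> joinable true`
            self.fs.wait_for_daemons()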

qa/tasks/cephfs/cephfs_test_case.py
qa/tasks/cephfs/test_auto_repair.py
qa/tasks/cephfs/test_cap_flush.py
qa/tasks/cephfs/test_client_recovery.py
qa/tasks/cephfs/test_damage.py
qa/tasks/cephfs/test_data_scan.py
qa/tasks/cephfs/test_forward_scrub.py
qa/tasks/cephfs/test_misc.py
qa/tasks/cephfs/test_recovery_pool.py
qa/tasks/cephfs/test_scrub.py

index 57ddc48d5c3d80d747ab9917edd6090c8f0b699e..9ef7989d9468d67a6631c1be088a09e9f2be0a0c 100644 (file)
@@ -195,7 +195,6 @@ class CephFSTestCase(CephTestCase):
             self.recovery_fs.set_data_pool_name(self.fs.get_data_pool_name())
             self.recovery_fs.create()
             self.recovery_fs.getinfo(refresh=True)
-            self.recovery_fs.mds_restart()
             self.recovery_fs.wait_for_daemons()
 
         # Load any config settings of interest
index 141be9883d02225d2c497338ff787c9abbdc2a24..63a98c830dae416b69f4d3a7025a24ad57e7d602 100644 (file)
@@ -36,8 +36,7 @@ class TestMDSAutoRepair(CephFSTestCase):
 
         # Restart the MDS to drop the metadata cache (because we expired the journal,
         # nothing gets replayed into cache on restart)
-        self.fs.mds_stop()
-        self.fs.mds_fail_restart()
+        self.fs.rank_fail()
         self.fs.wait_for_daemons()
 
         # remove testdir1's backtrace
index 0b4910ccb477209ecc8b2868e586f4c2495611ba..2fc9410d13a77bb12d5fffe01ab6c6515eae5e55 100644 (file)
@@ -49,8 +49,7 @@ class TestCapFlush(CephFSTestCase):
         time.sleep(10)
 
         # Restart mds. Client will re-send the unsafe request and cap flush
-        self.fs.mds_stop()
-        self.fs.mds_fail_restart()
+        self.fs.rank_fail()
         self.fs.wait_for_daemons()
 
         mode = self.mount_a.run_shell(['stat', '-c' '%a', file_path]).stdout.getvalue().strip()
index ab2fe239c1c6a5d2717e8e89b5f51b35b30099f0..3ae208a69925b6706c7cee264238af471dd6b321 100644 (file)
@@ -135,13 +135,12 @@ class TestClientRecovery(CephFSTestCase):
         # =================
         # Check that if I stop an MDS and a client goes away, the MDS waits
         # for the reconnect period
-        self.fs.mds_stop()
-        self.fs.mds_fail()
+        self.fs.fail()
 
         mount_a_client_id = self.mount_a.get_global_id()
         self.mount_a.umount_wait(force=True)
 
-        self.fs.mds_restart()
+        self.fs.set_joinable()
 
         self.fs.wait_for_state('up:reconnect', reject='up:active', timeout=MDS_RESTART_GRACE)
         # Check that the MDS locally reports its state correctly
@@ -178,8 +177,7 @@ class TestClientRecovery(CephFSTestCase):
         # =========================
         mount_a_client_id = self.mount_a.get_global_id()
 
-        self.fs.mds_stop()
-        self.fs.mds_fail()
+        self.fs.fail()
 
         # The mount goes away while the MDS is offline
         self.mount_a.kill()
@@ -187,7 +185,7 @@ class TestClientRecovery(CephFSTestCase):
         # wait for it to die
         time.sleep(5)
 
-        self.fs.mds_restart()
+        self.fs.set_joinable()
 
         # Enter reconnect phase
         self.fs.wait_for_state('up:reconnect', reject='up:active', timeout=MDS_RESTART_GRACE)
@@ -468,13 +466,12 @@ class TestClientRecovery(CephFSTestCase):
         )
 
         # Immediately kill the MDS and then client A
-        self.fs.mds_stop()
-        self.fs.mds_fail()
+        self.fs.fail()
         self.mount_a.kill()
         self.mount_a.kill_cleanup()
 
         # Restart the MDS.  Wait for it to come up, it'll have to time out in clientreplay
-        self.fs.mds_restart()
+        self.fs.set_joinable()
         log.info("Waiting for reconnect...")
         self.fs.wait_for_state("up:reconnect")
         log.info("Waiting for active...")
index c6067dc320620f031c29cf994354f1872e954207..7e95032a0dc7d41d41813a174a591f2157f51a8e 100644 (file)
@@ -51,8 +51,7 @@ class TestDamage(CephFSTestCase):
         for mds_name in self.fs.get_active_names():
             self.fs.mds_asok(["flush", "journal"], mds_name)
 
-        self.fs.mds_stop()
-        self.fs.mds_fail()
+        self.fs.fail()
 
         self.fs.rados(['export', '/tmp/metadata.bin'])
 
@@ -242,8 +241,7 @@ class TestDamage(CephFSTestCase):
 
             # Reset MDS state
             self.mount_a.umount_wait(force=True)
-            self.fs.mds_stop()
-            self.fs.mds_fail()
+            self.fs.fail()
             self.fs.mon_manager.raw_cluster_cmd('mds', 'repaired', '0')
 
             # Reset RADOS pool state
@@ -253,7 +251,7 @@ class TestDamage(CephFSTestCase):
             mutation.mutate_fn()
 
             # Try starting the MDS
-            self.fs.mds_restart()
+            self.fs.set_joinable()
 
             # How long we'll wait between starting a daemon and expecting
             # it to make it through startup, and potentially declare itself
@@ -388,8 +386,7 @@ class TestDamage(CephFSTestCase):
         for mds_name in self.fs.get_active_names():
             self.fs.mds_asok(["flush", "journal"], mds_name)
 
-        self.fs.mds_stop()
-        self.fs.mds_fail()
+        self.fs.fail()
 
         # Corrupt a dentry
         junk = "deadbeef" * 10
@@ -397,7 +394,7 @@ class TestDamage(CephFSTestCase):
         self.fs.rados(["setomapval", dirfrag_obj, "file_to_be_damaged_head", junk])
 
         # Start up and try to list it
-        self.fs.mds_restart()
+        self.fs.set_joinable()
         self.fs.wait_for_daemons()
 
         self.mount_a.mount_wait()
@@ -497,9 +494,9 @@ class TestDamage(CephFSTestCase):
         self.fs.mds_asok(["flush", "journal"])
 
         # Drop everything from the MDS cache
-        self.mds_cluster.mds_stop()
+        self.fs.fail()
         self.fs.journal_tool(['journal', 'reset'], 0)
-        self.mds_cluster.mds_fail_restart()
+        self.fs.set_joinable()
         self.fs.wait_for_daemons()
 
         self.mount_a.mount_wait()
index 0f0a352a2ac27a3461e011a9361f22587106dff5..d3d11c6b77e53503e586cf9944cae06b89a35a15 100644 (file)
@@ -328,8 +328,7 @@ class TestDataScan(CephFSTestCase):
         workload.flush()
 
         # Stop the MDS
-        self.fs.mds_stop()
-        self.fs.mds_fail()
+        self.fs.fail()
 
         # After recovery, we need the MDS to not be strict about stats (in production these options
         # are off by default, but in QA we need to explicitly disable them)
@@ -343,7 +342,7 @@ class TestDataScan(CephFSTestCase):
         # only understands how to rebuild metadata under rank 0
         self.fs.reset()
 
-        self.fs.mds_restart()
+        self.fs.set_joinable()
 
         def get_state(mds_id):
             info = self.mds_cluster.get_mds_info(mds_id)
@@ -458,8 +457,7 @@ class TestDataScan(CephFSTestCase):
         # Flush journal and stop MDS
         self.mount_a.umount_wait()
         self.fs.mds_asok(["flush", "journal"], mds_id)
-        self.fs.mds_stop()
-        self.fs.mds_fail()
+        self.fs.fail()
 
         # Pick a dentry and wipe out its key
         # Because I did a 1 bit split, I know one frag will be named <inode>.01000000
@@ -471,7 +469,7 @@ class TestDataScan(CephFSTestCase):
         self.fs.rados(["rmomapkey", frag_obj_id, victim_key])
 
         # Start filesystem back up, observe that the file appears to be gone in an `ls`
-        self.fs.mds_restart()
+        self.fs.set_joinable()
         self.fs.wait_for_daemons()
         self.mount_a.mount_wait()
         files = self.mount_a.run_shell(["ls", "subdir/"]).stdout.getvalue().strip().split("\n")
@@ -479,8 +477,7 @@ class TestDataScan(CephFSTestCase):
 
         # Stop the filesystem
         self.mount_a.umount_wait()
-        self.fs.mds_stop()
-        self.fs.mds_fail()
+        self.fs.fail()
 
         # Run data-scan, observe that it inserts our dentry back into the correct fragment
         # by checking the omap now has the dentry's key again
@@ -491,7 +488,7 @@ class TestDataScan(CephFSTestCase):
 
         # Start the filesystem and check that the dentry we deleted is now once again visible
         # and points to the correct file data.
-        self.fs.mds_restart()
+        self.fs.set_joinable()
         self.fs.wait_for_daemons()
         self.mount_a.mount_wait()
         out = self.mount_a.run_shell(["cat", "subdir/{0}".format(victim_dentry)]).stdout.getvalue().strip()
@@ -593,8 +590,7 @@ class TestDataScan(CephFSTestCase):
         self.mount_a.umount_wait()
 
         self.fs.mds_asok(["flush", "journal"], mds_id)
-        self.fs.mds_stop()
-        self.fs.mds_fail()
+        self.fs.fail()
 
         # repair linkage errors
         self.fs.data_scan(["scan_links"])
@@ -602,7 +598,7 @@ class TestDataScan(CephFSTestCase):
         # primary link in testdir2 was deleted?
         self.assertNotIn(file1_key, self._dirfrag_keys(dirfrag2_oid))
 
-        self.fs.mds_restart()
+        self.fs.set_joinable()
         self.fs.wait_for_daemons()
 
         self.mount_a.mount_wait()
@@ -641,7 +637,7 @@ class TestDataScan(CephFSTestCase):
 
         self.fs.mds_asok(["flush", "journal"], mds0_id)
         self.fs.mds_asok(["flush", "journal"], mds1_id)
-        self.mds_cluster.mds_stop()
+        self.fs.fail()
 
         self.fs.rados(["rm", "mds0_inotable"])
         self.fs.rados(["rm", "mds1_inotable"])
index 59ca24845095b0f5df0e7ae36dc3c4542393e078..69b799fc082f0db4b304f243bbd4fba0336ca964 100644 (file)
@@ -146,14 +146,13 @@ class TestForwardScrub(CephFSTestCase):
         # Orphan an inode by deleting its dentry
         # Our victim will be.... bravo.
         self.mount_a.umount_wait()
-        self.fs.mds_stop()
-        self.fs.mds_fail()
+        self.fs.fail()
         self.fs.set_ceph_conf('mds', 'mds verify scatter', False)
         self.fs.set_ceph_conf('mds', 'mds debug scatterstat', False)
         frag_obj_id = "{0:x}.00000000".format(inos["./parent/flushed"])
         self.fs.rados(["rmomapkey", frag_obj_id, "bravo_head"])
 
-        self.fs.mds_restart()
+        self.fs.set_joinable()
         self.fs.wait_for_daemons()
 
         # See that the orphaned file is indeed missing from a client's POV
@@ -177,8 +176,7 @@ class TestForwardScrub(CephFSTestCase):
         self.assertTagged(inos['./parent/unflushed/jfile'], tag, self.fs.get_data_pool_name())
 
         # Run cephfs-data-scan targeting only orphans
-        self.fs.mds_stop()
-        self.fs.mds_fail()
+        self.fs.fail()
         self.fs.data_scan(["scan_extents", self.fs.get_data_pool_name()])
         self.fs.data_scan([
             "scan_inodes",
@@ -192,7 +190,7 @@ class TestForwardScrub(CephFSTestCase):
 
         # And we should have all the same linkage we started with,
         # and no lost+found, and no extra inodes!
-        self.fs.mds_restart()
+        self.fs.set_joinable()
         self.fs.wait_for_daemons()
         self.mount_a.mount_wait()
         self._validate_linkage(inos)
@@ -239,8 +237,7 @@ class TestForwardScrub(CephFSTestCase):
             self.assertEqual(out_json["return_code"], 0)
             self.assertEqual(self.fs.wait_until_scrub_complete(tag=out_json["scrub_tag"]), True)
 
-        self.mds_cluster.mds_stop()
-        self.mds_cluster.mds_fail()
+        self.fs.fail()
 
         # Truncate the journal (to ensure the inotable on disk
         # is all that will be in the InoTable in memory)
@@ -255,7 +252,7 @@ class TestForwardScrub(CephFSTestCase):
         for key, value in inotable_copy.items():
            self.fs.put_metadata_object_raw(key, value)
 
-        self.mds_cluster.mds_restart()
+        self.fs.set_joinable()
         self.fs.wait_for_daemons()
 
         with self.assert_cluster_log("inode table repaired"):
@@ -264,7 +261,7 @@ class TestForwardScrub(CephFSTestCase):
             self.assertEqual(out_json["return_code"], 0)
             self.assertEqual(self.fs.wait_until_scrub_complete(tag=out_json["scrub_tag"]), True)
 
-        self.mds_cluster.mds_stop()
+        self.fs.fail()
         table_text = self.fs.table_tool(["0", "show", "inode"])
         table = json.loads(table_text)
         self.assertGreater(
index 6a295bbfdf1aca7dcf442385b994c1e3e4e69c6e..b8740136067aa84d817f22dea4dcad20bc4edeb2 100644 (file)
@@ -70,8 +70,7 @@ class TestMisc(CephFSTestCase):
 
         data_pool_name = self.fs.get_data_pool_name()
 
-        self.fs.mds_stop()
-        self.fs.mds_fail()
+        self.fs.fail()
 
         self.fs.mon_manager.raw_cluster_cmd('fs', 'rm', self.fs.name,
                                             '--yes-i-really-mean-it')
@@ -109,10 +108,11 @@ class TestMisc(CephFSTestCase):
                                             self.fs.metadata_pool_name,
                                             data_pool_name, "--force")
 
+        self.fs.mon_manager.raw_cluster_cmd('fs', 'fail', self.fs.name)
+
         self.fs.mon_manager.raw_cluster_cmd('fs', 'rm', self.fs.name,
                                             '--yes-i-really-mean-it')
 
-
         self.fs.mon_manager.raw_cluster_cmd('osd', 'pool', 'delete',
                                             self.fs.metadata_pool_name,
                                             self.fs.metadata_pool_name,
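
One detail worth spelling out for the hunk above: the monitors refuse `fs rm`
while the file system still has active MDS ranks. With the earlier teardown
switched to self.fs.fail(), the MDS daemons stay up as standbys and join the
newly created file system, so the subsequent `fs rm` would be rejected; that
is presumably why the new `fs fail` call now precedes it. A minimal sketch of
the required ordering (commands as in the hunk above):

    # Fail the file system so no rank is active, then remove it; removing a
    # file system that still has active ranks is refused by the monitors.
    self.fs.mon_manager.raw_cluster_cmd('fs', 'fail', self.fs.name)
    self.fs.mon_manager.raw_cluster_cmd('fs', 'rm', self.fs.name,
                                        '--yes-i-really-mean-it')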
index 2838f67b0dba3c19959835bcb6950d053f06b4ec..7658f0ab32aaaf05afefb845f4f2f9b38dade2d0 100644 (file)
@@ -121,8 +121,8 @@ class TestRecoveryPool(CephFSTestCase):
         self.recovery_fs.table_tool([recovery_fs + ":0", "reset", "inode"])
 
         # Stop the MDS
-        self.fs.mds_stop()
-        self.fs.mds_fail()
+        self.fs.mds_stop() # otherwise MDS will join once the fs is reset
+        self.fs.fail()
 
         # After recovery, we need the MDS to not be strict about stats (in production these options
         # are off by default, but in QA we need to explicitly disable them)
@@ -146,7 +146,6 @@ class TestRecoveryPool(CephFSTestCase):
                 # Normal reset should fail when no objects are present, we'll use --force instead
                 self.fs.journal_tool(["journal", "reset"], 0)
 
-        self.fs.mds_stop()
         self.fs.data_scan(['scan_extents', '--alternate-pool',
                            recovery_pool, '--filesystem', self.fs.name,
                            self.fs.get_data_pool_name()])
@@ -174,6 +173,7 @@ class TestRecoveryPool(CephFSTestCase):
 
         # Start the MDS
         self.fs.mds_restart()
+        self.fs.set_joinable()
         self.recovery_fs.mds_restart()
         self.fs.wait_for_daemons()
         self.recovery_fs.wait_for_daemons()
index 01e9dc8f6f3f8ce71e6c55b30d0a4e8bcdba7ead..37f0f98f71d1811825146c21072ec92d129684fd 100644 (file)
@@ -12,8 +12,9 @@ ValidationError = namedtuple("ValidationError", ["exception", "backtrace"])
 
 
 class Workload(CephFSTestCase):
-    def __init__(self, filesystem, mount):
+    def __init__(self, test, filesystem, mount):
         super().__init__()
+        self._test =  test
         self._mount = mount
         self._filesystem = filesystem
         self._initial_state = None
@@ -94,13 +95,13 @@ class DupInodeWorkload(Workload):
         temp_bin_path = "/tmp/10000000000.00000000_omap.bin"
         self._mount.umount_wait()
         self._filesystem.mds_asok(["flush", "journal"])
-        self._filesystem.mds_stop()
+        self._filesystem.fail()
         self._filesystem.rados(["getomapval", "10000000000.00000000",
                                 "parentfile_head", temp_bin_path])
         self._filesystem.rados(["setomapval", "10000000000.00000000",
                                 "shadow_head"], stdin_file=temp_bin_path)
-        self._filesystem.set_ceph_conf('mds', 'mds hack allow loading invalid metadata', True)
-        self._filesystem.mds_restart()
+        self._test.config_set('mds', 'mds_hack_allow_loading_invalid_metadata', True)
+        self._filesystem.set_joinable()
         self._filesystem.wait_for_daemons()
 
     def validate(self):
@@ -170,10 +171,10 @@ class TestScrub(CephFSTestCase):
         self.assertEqual(self._get_damage_count(), 0)
 
     def test_scrub_backtrace_for_new_files(self):
-        self._scrub_new_files(BacktraceWorkload(self.fs, self.mount_a))
+        self._scrub_new_files(BacktraceWorkload(self, self.fs, self.mount_a))
 
     def test_scrub_backtrace(self):
-        self._scrub(BacktraceWorkload(self.fs, self.mount_a))
+        self._scrub(BacktraceWorkload(self, self.fs, self.mount_a))
 
     def test_scrub_dup_inode(self):
-        self._scrub(DupInodeWorkload(self.fs, self.mount_a))
+        self._scrub(DupInodeWorkload(self, self.fs, self.mount_a))