From: Patrick Donnelly
Date: Thu, 11 Mar 2021 19:06:23 +0000 (-0800)
Subject: qa: simplify tests which stop MDS ranks
X-Git-Tag: v16.2.2~8^2~5^2~4
X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=d7f973393cfb39ca919538aef048280ba4a0d8a5;p=ceph.git

qa: simplify tests which stop MDS ranks

Instead of stopping MDS daemons and individually failing MDS daemons,
just fail the ranks or the entire file system, where possible.

Signed-off-by: Patrick Donnelly
(cherry picked from commit 0825d6aa9e1997d21b8a32f36c6c0544bf3ff40b)
---

diff --git a/qa/tasks/cephfs/cephfs_test_case.py b/qa/tasks/cephfs/cephfs_test_case.py
index 57ddc48d5c3d..9ef7989d9468 100644
--- a/qa/tasks/cephfs/cephfs_test_case.py
+++ b/qa/tasks/cephfs/cephfs_test_case.py
@@ -195,7 +195,6 @@ class CephFSTestCase(CephTestCase):
             self.recovery_fs.set_data_pool_name(self.fs.get_data_pool_name())
             self.recovery_fs.create()
             self.recovery_fs.getinfo(refresh=True)
-            self.recovery_fs.mds_restart()
             self.recovery_fs.wait_for_daemons()
 
         # Load an config settings of interest
diff --git a/qa/tasks/cephfs/test_auto_repair.py b/qa/tasks/cephfs/test_auto_repair.py
index 141be9883d02..63a98c830dae 100644
--- a/qa/tasks/cephfs/test_auto_repair.py
+++ b/qa/tasks/cephfs/test_auto_repair.py
@@ -36,8 +36,7 @@ class TestMDSAutoRepair(CephFSTestCase):
 
         # Restart the MDS to drop the metadata cache (because we expired the journal,
         # nothing gets replayed into cache on restart)
-        self.fs.mds_stop()
-        self.fs.mds_fail_restart()
+        self.fs.rank_fail()
         self.fs.wait_for_daemons()
 
         # remove testdir1's backtrace
diff --git a/qa/tasks/cephfs/test_cap_flush.py b/qa/tasks/cephfs/test_cap_flush.py
index 0b4910ccb477..2fc9410d13a7 100644
--- a/qa/tasks/cephfs/test_cap_flush.py
+++ b/qa/tasks/cephfs/test_cap_flush.py
@@ -49,8 +49,7 @@ class TestCapFlush(CephFSTestCase):
         time.sleep(10)
 
         # Restart mds. Client will re-send the unsafe request and cap flush
-        self.fs.mds_stop()
-        self.fs.mds_fail_restart()
+        self.fs.rank_fail()
         self.fs.wait_for_daemons()
 
         mode = self.mount_a.run_shell(['stat', '-c' '%a', file_path]).stdout.getvalue().strip()
diff --git a/qa/tasks/cephfs/test_client_recovery.py b/qa/tasks/cephfs/test_client_recovery.py
index ab2fe239c1c6..3ae208a69925 100644
--- a/qa/tasks/cephfs/test_client_recovery.py
+++ b/qa/tasks/cephfs/test_client_recovery.py
@@ -135,13 +135,12 @@ class TestClientRecovery(CephFSTestCase):
         # =================
         # Check that if I stop an MDS and a client goes away, the MDS waits
         # for the reconnect period
-        self.fs.mds_stop()
-        self.fs.mds_fail()
+        self.fs.fail()
 
         mount_a_client_id = self.mount_a.get_global_id()
         self.mount_a.umount_wait(force=True)
 
-        self.fs.mds_restart()
+        self.fs.set_joinable()
         self.fs.wait_for_state('up:reconnect', reject='up:active', timeout=MDS_RESTART_GRACE)
 
         # Check that the MDS locally reports its state correctly
@@ -178,8 +177,7 @@ class TestClientRecovery(CephFSTestCase):
         # =========================
         mount_a_client_id = self.mount_a.get_global_id()
 
-        self.fs.mds_stop()
-        self.fs.mds_fail()
+        self.fs.fail()
 
         # The mount goes away while the MDS is offline
         self.mount_a.kill()
@@ -187,7 +185,7 @@ class TestClientRecovery(CephFSTestCase):
         # wait for it to die
         time.sleep(5)
 
-        self.fs.mds_restart()
+        self.fs.set_joinable()
 
         # Enter reconnect phase
         self.fs.wait_for_state('up:reconnect', reject='up:active', timeout=MDS_RESTART_GRACE)
@@ -468,13 +466,12 @@ class TestClientRecovery(CephFSTestCase):
         )
 
         # Immediately kill the MDS and then client A
-        self.fs.mds_stop()
-        self.fs.mds_fail()
+        self.fs.fail()
         self.mount_a.kill()
         self.mount_a.kill_cleanup()
 
         # Restart the MDS. Wait for it to come up, it'll have to time out in clientreplay
-        self.fs.mds_restart()
+        self.fs.set_joinable()
         log.info("Waiting for reconnect...")
         self.fs.wait_for_state("up:reconnect")
         log.info("Waiting for active...")
diff --git a/qa/tasks/cephfs/test_damage.py b/qa/tasks/cephfs/test_damage.py
index c6067dc32062..7e95032a0dc7 100644
--- a/qa/tasks/cephfs/test_damage.py
+++ b/qa/tasks/cephfs/test_damage.py
@@ -51,8 +51,7 @@ class TestDamage(CephFSTestCase):
         for mds_name in self.fs.get_active_names():
             self.fs.mds_asok(["flush", "journal"], mds_name)
 
-        self.fs.mds_stop()
-        self.fs.mds_fail()
+        self.fs.fail()
 
         self.fs.rados(['export', '/tmp/metadata.bin'])
 
@@ -242,8 +241,7 @@ class TestDamage(CephFSTestCase):
 
             # Reset MDS state
             self.mount_a.umount_wait(force=True)
-            self.fs.mds_stop()
-            self.fs.mds_fail()
+            self.fs.fail()
             self.fs.mon_manager.raw_cluster_cmd('mds', 'repaired', '0')
 
             # Reset RADOS pool state
@@ -253,7 +251,7 @@ class TestDamage(CephFSTestCase):
             mutation.mutate_fn()
 
             # Try starting the MDS
-            self.fs.mds_restart()
+            self.fs.set_joinable()
 
             # How long we'll wait between starting a daemon and expecting
             # it to make it through startup, and potentially declare itself
@@ -388,8 +386,7 @@ class TestDamage(CephFSTestCase):
         for mds_name in self.fs.get_active_names():
             self.fs.mds_asok(["flush", "journal"], mds_name)
 
-        self.fs.mds_stop()
-        self.fs.mds_fail()
+        self.fs.fail()
 
         # Corrupt a dentry
         junk = "deadbeef" * 10
@@ -397,7 +394,7 @@ class TestDamage(CephFSTestCase):
         self.fs.rados(["setomapval", dirfrag_obj, "file_to_be_damaged_head", junk])
 
         # Start up and try to list it
-        self.fs.mds_restart()
+        self.fs.set_joinable()
         self.fs.wait_for_daemons()
 
         self.mount_a.mount_wait()
@@ -497,9 +494,9 @@ class TestDamage(CephFSTestCase):
         self.fs.mds_asok(["flush", "journal"])
 
         # Drop everything from the MDS cache
-        self.mds_cluster.mds_stop()
+        self.fs.fail()
         self.fs.journal_tool(['journal', 'reset'], 0)
-        self.mds_cluster.mds_fail_restart()
+        self.fs.set_joinable()
         self.fs.wait_for_daemons()
 
         self.mount_a.mount_wait()
diff --git a/qa/tasks/cephfs/test_data_scan.py b/qa/tasks/cephfs/test_data_scan.py
index 0f0a352a2ac2..d3d11c6b77e5 100644
--- a/qa/tasks/cephfs/test_data_scan.py
+++ b/qa/tasks/cephfs/test_data_scan.py
@@ -328,8 +328,7 @@ class TestDataScan(CephFSTestCase):
         workload.flush()
 
         # Stop the MDS
-        self.fs.mds_stop()
-        self.fs.mds_fail()
+        self.fs.fail()
 
         # After recovery, we need the MDS to not be strict about stats (in production these options
         # are off by default, but in QA we need to explicitly disable them)
@@ -343,7 +342,7 @@ class TestDataScan(CephFSTestCase):
         # only understands how to rebuild metadata under rank 0
         self.fs.reset()
 
-        self.fs.mds_restart()
+        self.fs.set_joinable()
 
         def get_state(mds_id):
             info = self.mds_cluster.get_mds_info(mds_id)
@@ -458,8 +457,7 @@ class TestDataScan(CephFSTestCase):
         # Flush journal and stop MDS
         self.mount_a.umount_wait()
         self.fs.mds_asok(["flush", "journal"], mds_id)
-        self.fs.mds_stop()
-        self.fs.mds_fail()
+        self.fs.fail()
 
         # Pick a dentry and wipe out its key
         # Because I did a 1 bit split, I know one frag will be named .01000000
@@ -471,7 +469,7 @@ class TestDataScan(CephFSTestCase):
         self.fs.rados(["rmomapkey", frag_obj_id, victim_key])
 
         # Start filesystem back up, observe that the file appears to be gone in an `ls`
-        self.fs.mds_restart()
+        self.fs.set_joinable()
         self.fs.wait_for_daemons()
         self.mount_a.mount_wait()
         files = self.mount_a.run_shell(["ls", "subdir/"]).stdout.getvalue().strip().split("\n")
@@ -479,8 +477,7 @@ class TestDataScan(CephFSTestCase):
 
         # Stop the filesystem
         self.mount_a.umount_wait()
-        self.fs.mds_stop()
-        self.fs.mds_fail()
+        self.fs.fail()
 
         # Run data-scan, observe that it inserts our dentry back into the correct fragment
         # by checking the omap now has the dentry's key again
@@ -491,7 +488,7 @@ class TestDataScan(CephFSTestCase):
 
         # Start the filesystem and check that the dentry we deleted is now once again visible
         # and points to the correct file data.
-        self.fs.mds_restart()
+        self.fs.set_joinable()
         self.fs.wait_for_daemons()
         self.mount_a.mount_wait()
         out = self.mount_a.run_shell(["cat", "subdir/{0}".format(victim_dentry)]).stdout.getvalue().strip()
@@ -593,8 +590,7 @@ class TestDataScan(CephFSTestCase):
 
         self.mount_a.umount_wait()
         self.fs.mds_asok(["flush", "journal"], mds_id)
-        self.fs.mds_stop()
-        self.fs.mds_fail()
+        self.fs.fail()
 
         # repair linkage errors
         self.fs.data_scan(["scan_links"])
@@ -602,7 +598,7 @@ class TestDataScan(CephFSTestCase):
         # primary link in testdir2 was deleted?
         self.assertNotIn(file1_key, self._dirfrag_keys(dirfrag2_oid))
 
-        self.fs.mds_restart()
+        self.fs.set_joinable()
         self.fs.wait_for_daemons()
         self.mount_a.mount_wait()
 
@@ -641,7 +637,7 @@ class TestDataScan(CephFSTestCase):
         self.fs.mds_asok(["flush", "journal"], mds0_id)
         self.fs.mds_asok(["flush", "journal"], mds1_id)
 
-        self.mds_cluster.mds_stop()
+        self.fs.fail()
         self.fs.rados(["rm", "mds0_inotable"])
         self.fs.rados(["rm", "mds1_inotable"])
 
diff --git a/qa/tasks/cephfs/test_forward_scrub.py b/qa/tasks/cephfs/test_forward_scrub.py
index 59ca24845095..69b799fc082f 100644
--- a/qa/tasks/cephfs/test_forward_scrub.py
+++ b/qa/tasks/cephfs/test_forward_scrub.py
@@ -146,14 +146,13 @@ class TestForwardScrub(CephFSTestCase):
         # Orphan an inode by deleting its dentry
         # Our victim will be.... bravo.
         self.mount_a.umount_wait()
-        self.fs.mds_stop()
-        self.fs.mds_fail()
+        self.fs.fail()
         self.fs.set_ceph_conf('mds', 'mds verify scatter', False)
         self.fs.set_ceph_conf('mds', 'mds debug scatterstat', False)
 
         frag_obj_id = "{0:x}.00000000".format(inos["./parent/flushed"])
         self.fs.rados(["rmomapkey", frag_obj_id, "bravo_head"])
-        self.fs.mds_restart()
+        self.fs.set_joinable()
         self.fs.wait_for_daemons()
 
         # See that the orphaned file is indeed missing from a client's POV
@@ -177,8 +176,7 @@ class TestForwardScrub(CephFSTestCase):
         self.assertTagged(inos['./parent/unflushed/jfile'], tag, self.fs.get_data_pool_name())
 
         # Run cephfs-data-scan targeting only orphans
-        self.fs.mds_stop()
-        self.fs.mds_fail()
+        self.fs.fail()
         self.fs.data_scan(["scan_extents", self.fs.get_data_pool_name()])
         self.fs.data_scan([
             "scan_inodes",
@@ -192,7 +190,7 @@ class TestForwardScrub(CephFSTestCase):
 
         # And we should have all the same linkage we started with,
         # and no lost+found, and no extra inodes!
-        self.fs.mds_restart()
+        self.fs.set_joinable()
         self.fs.wait_for_daemons()
         self.mount_a.mount_wait()
         self._validate_linkage(inos)
@@ -239,8 +237,7 @@ class TestForwardScrub(CephFSTestCase):
         self.assertEqual(out_json["return_code"], 0)
         self.assertEqual(self.fs.wait_until_scrub_complete(tag=out_json["scrub_tag"]), True)
 
-        self.mds_cluster.mds_stop()
-        self.mds_cluster.mds_fail()
+        self.fs.fail()
 
         # Truncate the journal (to ensure the inotable on disk
         # is all that will be in the InoTable in memory)
@@ -255,7 +252,7 @@ class TestForwardScrub(CephFSTestCase):
         for key, value in inotable_copy.items():
             self.fs.put_metadata_object_raw(key, value)
 
-        self.mds_cluster.mds_restart()
+        self.fs.set_joinable()
         self.fs.wait_for_daemons()
 
         with self.assert_cluster_log("inode table repaired"):
@@ -264,7 +261,7 @@ class TestForwardScrub(CephFSTestCase):
         self.assertEqual(out_json["return_code"], 0)
         self.assertEqual(self.fs.wait_until_scrub_complete(tag=out_json["scrub_tag"]), True)
 
-        self.mds_cluster.mds_stop()
+        self.fs.fail()
         table_text = self.fs.table_tool(["0", "show", "inode"])
         table = json.loads(table_text)
         self.assertGreater(
diff --git a/qa/tasks/cephfs/test_misc.py b/qa/tasks/cephfs/test_misc.py
index 6a295bbfdf1a..b8740136067a 100644
--- a/qa/tasks/cephfs/test_misc.py
+++ b/qa/tasks/cephfs/test_misc.py
@@ -70,8 +70,7 @@ class TestMisc(CephFSTestCase):
 
         data_pool_name = self.fs.get_data_pool_name()
 
-        self.fs.mds_stop()
-        self.fs.mds_fail()
+        self.fs.fail()
 
         self.fs.mon_manager.raw_cluster_cmd('fs', 'rm', self.fs.name,
                                             '--yes-i-really-mean-it')
@@ -109,10 +108,11 @@ class TestMisc(CephFSTestCase):
                                             self.fs.metadata_pool_name,
                                             data_pool_name,
                                             "--force")
 
+        self.fs.mon_manager.raw_cluster_cmd('fs', 'fail', self.fs.name)
+
         self.fs.mon_manager.raw_cluster_cmd('fs', 'rm', self.fs.name,
                                             '--yes-i-really-mean-it')
-
         self.fs.mon_manager.raw_cluster_cmd('osd', 'pool', 'delete',
                                             self.fs.metadata_pool_name,
                                             self.fs.metadata_pool_name,
diff --git a/qa/tasks/cephfs/test_recovery_pool.py b/qa/tasks/cephfs/test_recovery_pool.py
index 2838f67b0dba..7658f0ab32aa 100644
--- a/qa/tasks/cephfs/test_recovery_pool.py
+++ b/qa/tasks/cephfs/test_recovery_pool.py
@@ -121,8 +121,8 @@ class TestRecoveryPool(CephFSTestCase):
         self.recovery_fs.table_tool([recovery_fs + ":0", "reset", "inode"])
 
         # Stop the MDS
-        self.fs.mds_stop()
-        self.fs.mds_fail()
+        self.fs.mds_stop()  # otherwise MDS will join once the fs is reset
+        self.fs.fail()
 
         # After recovery, we need the MDS to not be strict about stats (in production these options
         # are off by default, but in QA we need to explicitly disable them)
@@ -146,7 +146,6 @@ class TestRecoveryPool(CephFSTestCase):
         # Normal reset should fail when no objects are present, we'll use --force instead
         self.fs.journal_tool(["journal", "reset"], 0)
 
-        self.fs.mds_stop()
         self.fs.data_scan(['scan_extents', '--alternate-pool',
                            recovery_pool, '--filesystem', self.fs.name,
                            self.fs.get_data_pool_name()])
@@ -174,6 +173,7 @@ class TestRecoveryPool(CephFSTestCase):
 
         # Start the MDS
         self.fs.mds_restart()
+        self.fs.set_joinable()
         self.recovery_fs.mds_restart()
         self.fs.wait_for_daemons()
         self.recovery_fs.wait_for_daemons()
diff --git a/qa/tasks/cephfs/test_scrub.py b/qa/tasks/cephfs/test_scrub.py
index 01e9dc8f6f3f..37f0f98f71d1 100644
--- a/qa/tasks/cephfs/test_scrub.py
+++ b/qa/tasks/cephfs/test_scrub.py
@@ -12,8 +12,9 @@ ValidationError = namedtuple("ValidationError", ["exception", "backtrace"])
 
 
 class Workload(CephFSTestCase):
-    def __init__(self, filesystem, mount):
+    def __init__(self, test, filesystem, mount):
         super().__init__()
+        self._test = test
         self._mount = mount
         self._filesystem = filesystem
         self._initial_state = None
@@ -94,13 +95,13 @@ class DupInodeWorkload(Workload):
         temp_bin_path = "/tmp/10000000000.00000000_omap.bin"
         self._mount.umount_wait()
         self._filesystem.mds_asok(["flush", "journal"])
-        self._filesystem.mds_stop()
+        self._filesystem.fail()
         self._filesystem.rados(["getomapval", "10000000000.00000000",
                                 "parentfile_head", temp_bin_path])
         self._filesystem.rados(["setomapval", "10000000000.00000000",
                                 "shadow_head"], stdin_file=temp_bin_path)
-        self._filesystem.set_ceph_conf('mds', 'mds hack allow loading invalid metadata', True)
-        self._filesystem.mds_restart()
+        self._test.config_set('mds', 'mds_hack_allow_loading_invalid_metadata', True)
+        self._filesystem.set_joinable()
        self._filesystem.wait_for_daemons()
 
     def validate(self):
@@ -170,10 +171,10 @@ class TestScrub(CephFSTestCase):
         self.assertEqual(self._get_damage_count(), 0)
 
     def test_scrub_backtrace_for_new_files(self):
-        self._scrub_new_files(BacktraceWorkload(self.fs, self.mount_a))
+        self._scrub_new_files(BacktraceWorkload(self, self.fs, self.mount_a))
 
     def test_scrub_backtrace(self):
-        self._scrub(BacktraceWorkload(self.fs, self.mount_a))
+        self._scrub(BacktraceWorkload(self, self.fs, self.mount_a))
 
     def test_scrub_dup_inode(self):
-        self._scrub(DupInodeWorkload(self.fs, self.mount_a))
+        self._scrub(DupInodeWorkload(self, self.fs, self.mount_a))
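
For readers who do not have the qa helpers in front of them, here is a minimal sketch (not part of the patch) of the idiom the commit moves the tests toward. It assumes the qa suite's CephFSTestCase fixture, where self.fs wraps the file system under test; the helper calls on self.fs (mds_stop, mds_fail_restart, rank_fail, fail, set_joinable, wait_for_daemons) are taken from the diff above, while the class and method names themselves are hypothetical.

from tasks.cephfs.cephfs_test_case import CephFSTestCase


class TestRestartIdiomSketch(CephFSTestCase):
    def _bounce_rank_old_style(self):
        # Pattern being removed: stop the MDS daemon, mark it failed and
        # restart it, then wait for the daemons to become active again.
        self.fs.mds_stop()
        self.fs.mds_fail_restart()
        self.fs.wait_for_daemons()

    def _bounce_rank_new_style(self):
        # Replacement for bouncing a single rank: fail the rank and let an
        # MDS (a standby or the respawned daemon) take it over.
        self.fs.rank_fail()
        self.fs.wait_for_daemons()

    def _offline_work_new_style(self):
        # Replacement when the whole file system must be offline (e.g. for
        # RADOS-level metadata surgery): fail the file system, do the
        # offline work, then allow MDS daemons to join it again.
        self.fs.fail()
        # ... offline repair, journal or omap manipulation here ...
        self.fs.set_joinable()
        self.fs.wait_for_daemons()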