qa: simplify tests which stop MDS ranks
author     Patrick Donnelly <pdonnell@redhat.com>
           Thu, 11 Mar 2021 19:06:23 +0000 (11:06 -0800)
committer  Patrick Donnelly <pdonnell@redhat.com>
           Wed, 31 Mar 2021 14:29:54 +0000 (07:29 -0700)
Instead of stopping MDS daemons and then individually failing them, just
fail the ranks or the entire file system where possible.

Signed-off-by: Patrick Donnelly <pdonnell@redhat.com>
(cherry picked from commit 0825d6aa9e1997d21b8a32f36c6c0544bf3ff40b)
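
For context, the before/after pattern this change applies across the qa suites
looks roughly like the sketch below. The helper names (rank_fail, fail,
set_joinable, wait_for_daemons) come from the hunks that follow; the wrapper
test class and the CLI mappings in the comments are illustrative assumptions,
not part of the commit.

    # Hedged sketch: driving the rank- and filesystem-level helpers from a
    # CephFS qa test, assuming the CephFSTestCase/Filesystem API used below.
    from tasks.cephfs.cephfs_test_case import CephFSTestCase

    class TestFailoverSketch(CephFSTestCase):
        MDSS_REQUIRED = 2

        def test_restart_single_rank(self):
            # Old pattern removed by this commit:
            #   self.fs.mds_stop()
            #   self.fs.mds_fail_restart()
            # New pattern: fail rank 0 and wait for an MDS to take it again.
            self.fs.rank_fail()
            self.fs.wait_for_daemons()

        def test_offline_repair(self):
            # Old pattern removed by this commit:
            #   self.fs.mds_stop()
            #   self.fs.mds_fail()
            #   ... offline work on the pools ...
            #   self.fs.mds_restart()
            # New pattern: take the whole file system offline, do the offline
            # work, then let MDS daemons join the ranks again.
            self.fs.fail()            # roughly `ceph fs fail <name>`
            # ... mutate RADOS objects, run cephfs-data-scan, etc. ...
            self.fs.set_joinable()    # roughly `ceph fs set <name> joinable true`
            self.fs.wait_for_daemons()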

qa/tasks/cephfs/cephfs_test_case.py
qa/tasks/cephfs/test_auto_repair.py
qa/tasks/cephfs/test_cap_flush.py
qa/tasks/cephfs/test_client_recovery.py
qa/tasks/cephfs/test_damage.py
qa/tasks/cephfs/test_data_scan.py
qa/tasks/cephfs/test_forward_scrub.py
qa/tasks/cephfs/test_misc.py
qa/tasks/cephfs/test_recovery_pool.py
qa/tasks/cephfs/test_scrub.py

index 57ddc48d5c3d80d747ab9917edd6090c8f0b699e..9ef7989d9468d67a6631c1be088a09e9f2be0a0c 100644 (file)
@@ -195,7 +195,6 @@ class CephFSTestCase(CephTestCase):
             self.recovery_fs.set_data_pool_name(self.fs.get_data_pool_name())
             self.recovery_fs.create()
             self.recovery_fs.getinfo(refresh=True)
-            self.recovery_fs.mds_restart()
             self.recovery_fs.wait_for_daemons()
 
         # Load any config settings of interest
index 141be9883d02225d2c497338ff787c9abbdc2a24..63a98c830dae416b69f4d3a7025a24ad57e7d602 100644 (file)
@@ -36,8 +36,7 @@ class TestMDSAutoRepair(CephFSTestCase):
 
         # Restart the MDS to drop the metadata cache (because we expired the journal,
         # nothing gets replayed into cache on restart)
-        self.fs.mds_stop()
-        self.fs.mds_fail_restart()
+        self.fs.rank_fail()
         self.fs.wait_for_daemons()
 
         # remove testdir1's backtrace
index 0b4910ccb477209ecc8b2868e586f4c2495611ba..2fc9410d13a77bb12d5fffe01ab6c6515eae5e55 100644 (file)
@@ -49,8 +49,7 @@ class TestCapFlush(CephFSTestCase):
         time.sleep(10)
 
         # Restart mds. Client will re-send the unsafe request and cap flush
-        self.fs.mds_stop()
-        self.fs.mds_fail_restart()
+        self.fs.rank_fail()
         self.fs.wait_for_daemons()
 
         mode = self.mount_a.run_shell(['stat', '-c' '%a', file_path]).stdout.getvalue().strip()
index ab2fe239c1c6a5d2717e8e89b5f51b35b30099f0..3ae208a69925b6706c7cee264238af471dd6b321 100644 (file)
@@ -135,13 +135,12 @@ class TestClientRecovery(CephFSTestCase):
         # =================
         # Check that if I stop an MDS and a client goes away, the MDS waits
         # for the reconnect period
-        self.fs.mds_stop()
-        self.fs.mds_fail()
+        self.fs.fail()
 
         mount_a_client_id = self.mount_a.get_global_id()
         self.mount_a.umount_wait(force=True)
 
-        self.fs.mds_restart()
+        self.fs.set_joinable()
 
         self.fs.wait_for_state('up:reconnect', reject='up:active', timeout=MDS_RESTART_GRACE)
         # Check that the MDS locally reports its state correctly
@@ -178,8 +177,7 @@ class TestClientRecovery(CephFSTestCase):
         # =========================
         mount_a_client_id = self.mount_a.get_global_id()
 
-        self.fs.mds_stop()
-        self.fs.mds_fail()
+        self.fs.fail()
 
         # The mount goes away while the MDS is offline
         self.mount_a.kill()
@@ -187,7 +185,7 @@ class TestClientRecovery(CephFSTestCase):
         # wait for it to die
         time.sleep(5)
 
-        self.fs.mds_restart()
+        self.fs.set_joinable()
 
         # Enter reconnect phase
         self.fs.wait_for_state('up:reconnect', reject='up:active', timeout=MDS_RESTART_GRACE)
@@ -468,13 +466,12 @@ class TestClientRecovery(CephFSTestCase):
         )
 
         # Immediately kill the MDS and then client A
-        self.fs.mds_stop()
-        self.fs.mds_fail()
+        self.fs.fail()
         self.mount_a.kill()
         self.mount_a.kill_cleanup()
 
         # Restart the MDS.  Wait for it to come up, it'll have to time out in clientreplay
-        self.fs.mds_restart()
+        self.fs.set_joinable()
         log.info("Waiting for reconnect...")
         self.fs.wait_for_state("up:reconnect")
         log.info("Waiting for active...")
index c6067dc320620f031c29cf994354f1872e954207..7e95032a0dc7d41d41813a174a591f2157f51a8e 100644 (file)
@@ -51,8 +51,7 @@ class TestDamage(CephFSTestCase):
         for mds_name in self.fs.get_active_names():
             self.fs.mds_asok(["flush", "journal"], mds_name)
 
-        self.fs.mds_stop()
-        self.fs.mds_fail()
+        self.fs.fail()
 
         self.fs.rados(['export', '/tmp/metadata.bin'])
 
@@ -242,8 +241,7 @@ class TestDamage(CephFSTestCase):
 
             # Reset MDS state
             self.mount_a.umount_wait(force=True)
-            self.fs.mds_stop()
-            self.fs.mds_fail()
+            self.fs.fail()
             self.fs.mon_manager.raw_cluster_cmd('mds', 'repaired', '0')
 
             # Reset RADOS pool state
@@ -253,7 +251,7 @@ class TestDamage(CephFSTestCase):
             mutation.mutate_fn()
 
             # Try starting the MDS
-            self.fs.mds_restart()
+            self.fs.set_joinable()
 
             # How long we'll wait between starting a daemon and expecting
             # it to make it through startup, and potentially declare itself
@@ -388,8 +386,7 @@ class TestDamage(CephFSTestCase):
         for mds_name in self.fs.get_active_names():
             self.fs.mds_asok(["flush", "journal"], mds_name)
 
-        self.fs.mds_stop()
-        self.fs.mds_fail()
+        self.fs.fail()
 
         # Corrupt a dentry
         junk = "deadbeef" * 10
@@ -397,7 +394,7 @@ class TestDamage(CephFSTestCase):
         self.fs.rados(["setomapval", dirfrag_obj, "file_to_be_damaged_head", junk])
 
         # Start up and try to list it
-        self.fs.mds_restart()
+        self.fs.set_joinable()
         self.fs.wait_for_daemons()
 
         self.mount_a.mount_wait()
@@ -497,9 +494,9 @@ class TestDamage(CephFSTestCase):
         self.fs.mds_asok(["flush", "journal"])
 
         # Drop everything from the MDS cache
-        self.mds_cluster.mds_stop()
+        self.fs.fail()
         self.fs.journal_tool(['journal', 'reset'], 0)
-        self.mds_cluster.mds_fail_restart()
+        self.fs.set_joinable()
         self.fs.wait_for_daemons()
 
         self.mount_a.mount_wait()
index 0f0a352a2ac27a3461e011a9361f22587106dff5..d3d11c6b77e53503e586cf9944cae06b89a35a15 100644 (file)
@@ -328,8 +328,7 @@ class TestDataScan(CephFSTestCase):
         workload.flush()
 
         # Stop the MDS
-        self.fs.mds_stop()
-        self.fs.mds_fail()
+        self.fs.fail()
 
         # After recovery, we need the MDS to not be strict about stats (in production these options
         # are off by default, but in QA we need to explicitly disable them)
@@ -343,7 +342,7 @@ class TestDataScan(CephFSTestCase):
         # only understands how to rebuild metadata under rank 0
         self.fs.reset()
 
-        self.fs.mds_restart()
+        self.fs.set_joinable()
 
         def get_state(mds_id):
             info = self.mds_cluster.get_mds_info(mds_id)
@@ -458,8 +457,7 @@ class TestDataScan(CephFSTestCase):
         # Flush journal and stop MDS
         self.mount_a.umount_wait()
         self.fs.mds_asok(["flush", "journal"], mds_id)
-        self.fs.mds_stop()
-        self.fs.mds_fail()
+        self.fs.fail()
 
         # Pick a dentry and wipe out its key
         # Because I did a 1 bit split, I know one frag will be named <inode>.01000000
@@ -471,7 +469,7 @@ class TestDataScan(CephFSTestCase):
         self.fs.rados(["rmomapkey", frag_obj_id, victim_key])
 
         # Start filesystem back up, observe that the file appears to be gone in an `ls`
-        self.fs.mds_restart()
+        self.fs.set_joinable()
         self.fs.wait_for_daemons()
         self.mount_a.mount_wait()
         files = self.mount_a.run_shell(["ls", "subdir/"]).stdout.getvalue().strip().split("\n")
@@ -479,8 +477,7 @@ class TestDataScan(CephFSTestCase):
 
         # Stop the filesystem
         self.mount_a.umount_wait()
-        self.fs.mds_stop()
-        self.fs.mds_fail()
+        self.fs.fail()
 
         # Run data-scan, observe that it inserts our dentry back into the correct fragment
         # by checking the omap now has the dentry's key again
@@ -491,7 +488,7 @@ class TestDataScan(CephFSTestCase):
 
         # Start the filesystem and check that the dentry we deleted is now once again visible
         # and points to the correct file data.
-        self.fs.mds_restart()
+        self.fs.set_joinable()
         self.fs.wait_for_daemons()
         self.mount_a.mount_wait()
         out = self.mount_a.run_shell(["cat", "subdir/{0}".format(victim_dentry)]).stdout.getvalue().strip()
@@ -593,8 +590,7 @@ class TestDataScan(CephFSTestCase):
         self.mount_a.umount_wait()
 
         self.fs.mds_asok(["flush", "journal"], mds_id)
-        self.fs.mds_stop()
-        self.fs.mds_fail()
+        self.fs.fail()
 
         # repair linkage errors
         self.fs.data_scan(["scan_links"])
@@ -602,7 +598,7 @@ class TestDataScan(CephFSTestCase):
         # primary link in testdir2 was deleted?
         self.assertNotIn(file1_key, self._dirfrag_keys(dirfrag2_oid))
 
-        self.fs.mds_restart()
+        self.fs.set_joinable()
         self.fs.wait_for_daemons()
 
         self.mount_a.mount_wait()
@@ -641,7 +637,7 @@ class TestDataScan(CephFSTestCase):
 
         self.fs.mds_asok(["flush", "journal"], mds0_id)
         self.fs.mds_asok(["flush", "journal"], mds1_id)
-        self.mds_cluster.mds_stop()
+        self.fs.fail()
 
         self.fs.rados(["rm", "mds0_inotable"])
         self.fs.rados(["rm", "mds1_inotable"])
index 59ca24845095b0f5df0e7ae36dc3c4542393e078..69b799fc082f0db4b304f243bbd4fba0336ca964 100644 (file)
@@ -146,14 +146,13 @@ class TestForwardScrub(CephFSTestCase):
         # Orphan an inode by deleting its dentry
         # Our victim will be.... bravo.
         self.mount_a.umount_wait()
-        self.fs.mds_stop()
-        self.fs.mds_fail()
+        self.fs.fail()
         self.fs.set_ceph_conf('mds', 'mds verify scatter', False)
         self.fs.set_ceph_conf('mds', 'mds debug scatterstat', False)
         frag_obj_id = "{0:x}.00000000".format(inos["./parent/flushed"])
         self.fs.rados(["rmomapkey", frag_obj_id, "bravo_head"])
 
-        self.fs.mds_restart()
+        self.fs.set_joinable()
         self.fs.wait_for_daemons()
 
         # See that the orphaned file is indeed missing from a client's POV
@@ -177,8 +176,7 @@ class TestForwardScrub(CephFSTestCase):
         self.assertTagged(inos['./parent/unflushed/jfile'], tag, self.fs.get_data_pool_name())
 
         # Run cephfs-data-scan targeting only orphans
-        self.fs.mds_stop()
-        self.fs.mds_fail()
+        self.fs.fail()
         self.fs.data_scan(["scan_extents", self.fs.get_data_pool_name()])
         self.fs.data_scan([
             "scan_inodes",
@@ -192,7 +190,7 @@ class TestForwardScrub(CephFSTestCase):
 
         # And we should have all the same linkage we started with,
         # and no lost+found, and no extra inodes!
-        self.fs.mds_restart()
+        self.fs.set_joinable()
         self.fs.wait_for_daemons()
         self.mount_a.mount_wait()
         self._validate_linkage(inos)
@@ -239,8 +237,7 @@ class TestForwardScrub(CephFSTestCase):
             self.assertEqual(out_json["return_code"], 0)
             self.assertEqual(self.fs.wait_until_scrub_complete(tag=out_json["scrub_tag"]), True)
 
-        self.mds_cluster.mds_stop()
-        self.mds_cluster.mds_fail()
+        self.fs.fail()
 
         # Truncate the journal (to ensure the inotable on disk
         # is all that will be in the InoTable in memory)
@@ -255,7 +252,7 @@ class TestForwardScrub(CephFSTestCase):
         for key, value in inotable_copy.items():
            self.fs.put_metadata_object_raw(key, value)
 
-        self.mds_cluster.mds_restart()
+        self.fs.set_joinable()
         self.fs.wait_for_daemons()
 
         with self.assert_cluster_log("inode table repaired"):
@@ -264,7 +261,7 @@ class TestForwardScrub(CephFSTestCase):
             self.assertEqual(out_json["return_code"], 0)
             self.assertEqual(self.fs.wait_until_scrub_complete(tag=out_json["scrub_tag"]), True)
 
-        self.mds_cluster.mds_stop()
+        self.fs.fail()
         table_text = self.fs.table_tool(["0", "show", "inode"])
         table = json.loads(table_text)
         self.assertGreater(
index 6a295bbfdf1aca7dcf442385b994c1e3e4e69c6e..b8740136067aa84d817f22dea4dcad20bc4edeb2 100644 (file)
@@ -70,8 +70,7 @@ class TestMisc(CephFSTestCase):
 
         data_pool_name = self.fs.get_data_pool_name()
 
-        self.fs.mds_stop()
-        self.fs.mds_fail()
+        self.fs.fail()
 
         self.fs.mon_manager.raw_cluster_cmd('fs', 'rm', self.fs.name,
                                             '--yes-i-really-mean-it')
@@ -109,10 +108,11 @@ class TestMisc(CephFSTestCase):
                                             self.fs.metadata_pool_name,
                                             data_pool_name, "--force")
 
+        self.fs.mon_manager.raw_cluster_cmd('fs', 'fail', self.fs.name)
+
         self.fs.mon_manager.raw_cluster_cmd('fs', 'rm', self.fs.name,
                                             '--yes-i-really-mean-it')
 
-
         self.fs.mon_manager.raw_cluster_cmd('osd', 'pool', 'delete',
                                             self.fs.metadata_pool_name,
                                             self.fs.metadata_pool_name,
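
One detail worth spelling out for the hunk above: the monitors refuse `fs rm`
while the file system still has active MDS ranks. With the earlier teardown
switched to self.fs.fail(), the MDS daemons stay up as standbys and join the
newly created file system, so the subsequent `fs rm` would be rejected; that
is presumably why the new `fs fail` call now precedes it. A minimal sketch of
the required ordering (commands as in the hunk above):

    # Fail the file system so no rank is active, then remove it; removing a
    # file system that still has active ranks is refused by the monitors.
    self.fs.mon_manager.raw_cluster_cmd('fs', 'fail', self.fs.name)
    self.fs.mon_manager.raw_cluster_cmd('fs', 'rm', self.fs.name,
                                        '--yes-i-really-mean-it')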
index 2838f67b0dba3c19959835bcb6950d053f06b4ec..7658f0ab32aaaf05afefb845f4f2f9b38dade2d0 100644 (file)
@@ -121,8 +121,8 @@ class TestRecoveryPool(CephFSTestCase):
         self.recovery_fs.table_tool([recovery_fs + ":0", "reset", "inode"])
 
         # Stop the MDS
-        self.fs.mds_stop()
-        self.fs.mds_fail()
+        self.fs.mds_stop() # otherwise MDS will join once the fs is reset
+        self.fs.fail()
 
         # After recovery, we need the MDS to not be strict about stats (in production these options
         # are off by default, but in QA we need to explicitly disable them)
@@ -146,7 +146,6 @@ class TestRecoveryPool(CephFSTestCase):
                 # Normal reset should fail when no objects are present, we'll use --force instead
                 self.fs.journal_tool(["journal", "reset"], 0)
 
-        self.fs.mds_stop()
         self.fs.data_scan(['scan_extents', '--alternate-pool',
                            recovery_pool, '--filesystem', self.fs.name,
                            self.fs.get_data_pool_name()])
@@ -174,6 +173,7 @@ class TestRecoveryPool(CephFSTestCase):
 
         # Start the MDS
         self.fs.mds_restart()
+        self.fs.set_joinable()
         self.recovery_fs.mds_restart()
         self.fs.wait_for_daemons()
         self.recovery_fs.wait_for_daemons()
index 01e9dc8f6f3f8ce71e6c55b30d0a4e8bcdba7ead..37f0f98f71d1811825146c21072ec92d129684fd 100644 (file)
@@ -12,8 +12,9 @@ ValidationError = namedtuple("ValidationError", ["exception", "backtrace"])
 
 
 class Workload(CephFSTestCase):
-    def __init__(self, filesystem, mount):
+    def __init__(self, test, filesystem, mount):
         super().__init__()
+        self._test =  test
         self._mount = mount
         self._filesystem = filesystem
         self._initial_state = None
@@ -94,13 +95,13 @@ class DupInodeWorkload(Workload):
         temp_bin_path = "/tmp/10000000000.00000000_omap.bin"
         self._mount.umount_wait()
         self._filesystem.mds_asok(["flush", "journal"])
-        self._filesystem.mds_stop()
+        self._filesystem.fail()
         self._filesystem.rados(["getomapval", "10000000000.00000000",
                                 "parentfile_head", temp_bin_path])
         self._filesystem.rados(["setomapval", "10000000000.00000000",
                                 "shadow_head"], stdin_file=temp_bin_path)
-        self._filesystem.set_ceph_conf('mds', 'mds hack allow loading invalid metadata', True)
-        self._filesystem.mds_restart()
+        self._test.config_set('mds', 'mds_hack_allow_loading_invalid_metadata', True)
+        self._filesystem.set_joinable()
         self._filesystem.wait_for_daemons()
 
     def validate(self):
@@ -170,10 +171,10 @@ class TestScrub(CephFSTestCase):
         self.assertEqual(self._get_damage_count(), 0)
 
     def test_scrub_backtrace_for_new_files(self):
-        self._scrub_new_files(BacktraceWorkload(self.fs, self.mount_a))
+        self._scrub_new_files(BacktraceWorkload(self, self.fs, self.mount_a))
 
     def test_scrub_backtrace(self):
-        self._scrub(BacktraceWorkload(self.fs, self.mount_a))
+        self._scrub(BacktraceWorkload(self, self.fs, self.mount_a))
 
     def test_scrub_dup_inode(self):
-        self._scrub(DupInodeWorkload(self.fs, self.mount_a))
+        self._scrub(DupInodeWorkload(self, self.fs, self.mount_a))