From: Patrick Donnelly
Date: Thu, 11 Mar 2021 19:06:23 +0000 (-0800)
Subject: qa: simplify tests which stop MDS ranks
X-Git-Tag: v16.2.2~8^2~5^2~4
X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=d7f973393cfb39ca919538aef048280ba4a0d8a5;p=ceph.git

qa: simplify tests which stop MDS ranks

Instead of stopping MDS daemons and individually failing MDS daemons,
just fail the ranks or the entire file system, where possible.

Signed-off-by: Patrick Donnelly
(cherry picked from commit 0825d6aa9e1997d21b8a32f36c6c0544bf3ff40b)
---

diff --git a/qa/tasks/cephfs/cephfs_test_case.py b/qa/tasks/cephfs/cephfs_test_case.py
index 57ddc48d5c3d..9ef7989d9468 100644
--- a/qa/tasks/cephfs/cephfs_test_case.py
+++ b/qa/tasks/cephfs/cephfs_test_case.py
@@ -195,7 +195,6 @@ class CephFSTestCase(CephTestCase):
             self.recovery_fs.set_data_pool_name(self.fs.get_data_pool_name())
             self.recovery_fs.create()
             self.recovery_fs.getinfo(refresh=True)
-            self.recovery_fs.mds_restart()
             self.recovery_fs.wait_for_daemons()
 
         # Load an config settings of interest
diff --git a/qa/tasks/cephfs/test_auto_repair.py b/qa/tasks/cephfs/test_auto_repair.py
index 141be9883d02..63a98c830dae 100644
--- a/qa/tasks/cephfs/test_auto_repair.py
+++ b/qa/tasks/cephfs/test_auto_repair.py
@@ -36,8 +36,7 @@ class TestMDSAutoRepair(CephFSTestCase):
 
         # Restart the MDS to drop the metadata cache (because we expired the journal,
         # nothing gets replayed into cache on restart)
-        self.fs.mds_stop()
-        self.fs.mds_fail_restart()
+        self.fs.rank_fail()
         self.fs.wait_for_daemons()
 
         # remove testdir1's backtrace
diff --git a/qa/tasks/cephfs/test_cap_flush.py b/qa/tasks/cephfs/test_cap_flush.py
index 0b4910ccb477..2fc9410d13a7 100644
--- a/qa/tasks/cephfs/test_cap_flush.py
+++ b/qa/tasks/cephfs/test_cap_flush.py
@@ -49,8 +49,7 @@ class TestCapFlush(CephFSTestCase):
         time.sleep(10)
 
         # Restart mds. Client will re-send the unsafe request and cap flush
-        self.fs.mds_stop()
-        self.fs.mds_fail_restart()
+        self.fs.rank_fail()
         self.fs.wait_for_daemons()
 
         mode = self.mount_a.run_shell(['stat', '-c' '%a', file_path]).stdout.getvalue().strip()
diff --git a/qa/tasks/cephfs/test_client_recovery.py b/qa/tasks/cephfs/test_client_recovery.py
index ab2fe239c1c6..3ae208a69925 100644
--- a/qa/tasks/cephfs/test_client_recovery.py
+++ b/qa/tasks/cephfs/test_client_recovery.py
@@ -135,13 +135,12 @@ class TestClientRecovery(CephFSTestCase):
         # =================
         # Check that if I stop an MDS and a client goes away, the MDS waits
         # for the reconnect period
-        self.fs.mds_stop()
-        self.fs.mds_fail()
+        self.fs.fail()
 
         mount_a_client_id = self.mount_a.get_global_id()
         self.mount_a.umount_wait(force=True)
 
-        self.fs.mds_restart()
+        self.fs.set_joinable()
         self.fs.wait_for_state('up:reconnect', reject='up:active', timeout=MDS_RESTART_GRACE)
 
         # Check that the MDS locally reports its state correctly
@@ -178,8 +177,7 @@ class TestClientRecovery(CephFSTestCase):
         # =========================
         mount_a_client_id = self.mount_a.get_global_id()
 
-        self.fs.mds_stop()
-        self.fs.mds_fail()
+        self.fs.fail()
 
         # The mount goes away while the MDS is offline
         self.mount_a.kill()
@@ -187,7 +185,7 @@ class TestClientRecovery(CephFSTestCase):
         # wait for it to die
         time.sleep(5)
 
-        self.fs.mds_restart()
+        self.fs.set_joinable()
 
         # Enter reconnect phase
         self.fs.wait_for_state('up:reconnect', reject='up:active', timeout=MDS_RESTART_GRACE)
@@ -468,13 +466,12 @@ class TestClientRecovery(CephFSTestCase):
         )
 
         # Immediately kill the MDS and then client A
-        self.fs.mds_stop()
-        self.fs.mds_fail()
+        self.fs.fail()
         self.mount_a.kill()
         self.mount_a.kill_cleanup()
 
         # Restart the MDS. Wait for it to come up, it'll have to time out in clientreplay
-        self.fs.mds_restart()
+        self.fs.set_joinable()
         log.info("Waiting for reconnect...")
         self.fs.wait_for_state("up:reconnect")
         log.info("Waiting for active...")
diff --git a/qa/tasks/cephfs/test_damage.py b/qa/tasks/cephfs/test_damage.py
index c6067dc32062..7e95032a0dc7 100644
--- a/qa/tasks/cephfs/test_damage.py
+++ b/qa/tasks/cephfs/test_damage.py
@@ -51,8 +51,7 @@ class TestDamage(CephFSTestCase):
         for mds_name in self.fs.get_active_names():
             self.fs.mds_asok(["flush", "journal"], mds_name)
 
-        self.fs.mds_stop()
-        self.fs.mds_fail()
+        self.fs.fail()
 
         self.fs.rados(['export', '/tmp/metadata.bin'])
 
@@ -242,8 +241,7 @@ class TestDamage(CephFSTestCase):
 
             # Reset MDS state
             self.mount_a.umount_wait(force=True)
-            self.fs.mds_stop()
-            self.fs.mds_fail()
+            self.fs.fail()
             self.fs.mon_manager.raw_cluster_cmd('mds', 'repaired', '0')
 
             # Reset RADOS pool state
@@ -253,7 +251,7 @@ class TestDamage(CephFSTestCase):
             mutation.mutate_fn()
 
             # Try starting the MDS
-            self.fs.mds_restart()
+            self.fs.set_joinable()
 
             # How long we'll wait between starting a daemon and expecting
             # it to make it through startup, and potentially declare itself
@@ -388,8 +386,7 @@ class TestDamage(CephFSTestCase):
         for mds_name in self.fs.get_active_names():
             self.fs.mds_asok(["flush", "journal"], mds_name)
 
-        self.fs.mds_stop()
-        self.fs.mds_fail()
+        self.fs.fail()
 
         # Corrupt a dentry
         junk = "deadbeef" * 10
@@ -397,7 +394,7 @@ class TestDamage(CephFSTestCase):
         self.fs.rados(["setomapval", dirfrag_obj, "file_to_be_damaged_head", junk])
 
         # Start up and try to list it
-        self.fs.mds_restart()
+        self.fs.set_joinable()
         self.fs.wait_for_daemons()
 
         self.mount_a.mount_wait()
@@ -497,9 +494,9 @@ class TestDamage(CephFSTestCase):
         self.fs.mds_asok(["flush", "journal"])
 
         # Drop everything from the MDS cache
-        self.mds_cluster.mds_stop()
+        self.fs.fail()
         self.fs.journal_tool(['journal', 'reset'], 0)
-        self.mds_cluster.mds_fail_restart()
+        self.fs.set_joinable()
         self.fs.wait_for_daemons()
 
         self.mount_a.mount_wait()
diff --git a/qa/tasks/cephfs/test_data_scan.py b/qa/tasks/cephfs/test_data_scan.py
index 0f0a352a2ac2..d3d11c6b77e5 100644
--- a/qa/tasks/cephfs/test_data_scan.py
+++ b/qa/tasks/cephfs/test_data_scan.py
@@ -328,8 +328,7 @@ class TestDataScan(CephFSTestCase):
         workload.flush()
 
         # Stop the MDS
-        self.fs.mds_stop()
-        self.fs.mds_fail()
+        self.fs.fail()
 
         # After recovery, we need the MDS to not be strict about stats (in production these options
         # are off by default, but in QA we need to explicitly disable them)
@@ -343,7 +342,7 @@ class TestDataScan(CephFSTestCase):
         # only understands how to rebuild metadata under rank 0
         self.fs.reset()
 
-        self.fs.mds_restart()
+        self.fs.set_joinable()
 
         def get_state(mds_id):
             info = self.mds_cluster.get_mds_info(mds_id)
@@ -458,8 +457,7 @@ class TestDataScan(CephFSTestCase):
         # Flush journal and stop MDS
         self.mount_a.umount_wait()
         self.fs.mds_asok(["flush", "journal"], mds_id)
-        self.fs.mds_stop()
-        self.fs.mds_fail()
+        self.fs.fail()
 
         # Pick a dentry and wipe out its key
         # Because I did a 1 bit split, I know one frag will be named .01000000
@@ -471,7 +469,7 @@ class TestDataScan(CephFSTestCase):
         self.fs.rados(["rmomapkey", frag_obj_id, victim_key])
 
         # Start filesystem back up, observe that the file appears to be gone in an `ls`
-        self.fs.mds_restart()
+        self.fs.set_joinable()
         self.fs.wait_for_daemons()
         self.mount_a.mount_wait()
         files = self.mount_a.run_shell(["ls", "subdir/"]).stdout.getvalue().strip().split("\n")
@@ -479,8 +477,7 @@ class TestDataScan(CephFSTestCase):
 
         # Stop the filesystem
         self.mount_a.umount_wait()
-        self.fs.mds_stop()
-        self.fs.mds_fail()
+        self.fs.fail()
 
         # Run data-scan, observe that it inserts our dentry back into the correct fragment
         # by checking the omap now has the dentry's key again
@@ -491,7 +488,7 @@ class TestDataScan(CephFSTestCase):
 
         # Start the filesystem and check that the dentry we deleted is now once again visible
         # and points to the correct file data.
-        self.fs.mds_restart()
+        self.fs.set_joinable()
         self.fs.wait_for_daemons()
         self.mount_a.mount_wait()
         out = self.mount_a.run_shell(["cat", "subdir/{0}".format(victim_dentry)]).stdout.getvalue().strip()
@@ -593,8 +590,7 @@ class TestDataScan(CephFSTestCase):
 
         self.mount_a.umount_wait()
         self.fs.mds_asok(["flush", "journal"], mds_id)
-        self.fs.mds_stop()
-        self.fs.mds_fail()
+        self.fs.fail()
 
         # repair linkage errors
         self.fs.data_scan(["scan_links"])
@@ -602,7 +598,7 @@ class TestDataScan(CephFSTestCase):
         # primary link in testdir2 was deleted?
         self.assertNotIn(file1_key, self._dirfrag_keys(dirfrag2_oid))
 
-        self.fs.mds_restart()
+        self.fs.set_joinable()
         self.fs.wait_for_daemons()
         self.mount_a.mount_wait()
 
@@ -641,7 +637,7 @@ class TestDataScan(CephFSTestCase):
         self.fs.mds_asok(["flush", "journal"], mds0_id)
         self.fs.mds_asok(["flush", "journal"], mds1_id)
 
-        self.mds_cluster.mds_stop()
+        self.fs.fail()
         self.fs.rados(["rm", "mds0_inotable"])
         self.fs.rados(["rm", "mds1_inotable"])
 
diff --git a/qa/tasks/cephfs/test_forward_scrub.py b/qa/tasks/cephfs/test_forward_scrub.py
index 59ca24845095..69b799fc082f 100644
--- a/qa/tasks/cephfs/test_forward_scrub.py
+++ b/qa/tasks/cephfs/test_forward_scrub.py
@@ -146,14 +146,13 @@ class TestForwardScrub(CephFSTestCase):
         # Orphan an inode by deleting its dentry
         # Our victim will be.... bravo.
         self.mount_a.umount_wait()
-        self.fs.mds_stop()
-        self.fs.mds_fail()
+        self.fs.fail()
         self.fs.set_ceph_conf('mds', 'mds verify scatter', False)
         self.fs.set_ceph_conf('mds', 'mds debug scatterstat', False)
 
         frag_obj_id = "{0:x}.00000000".format(inos["./parent/flushed"])
         self.fs.rados(["rmomapkey", frag_obj_id, "bravo_head"])
-        self.fs.mds_restart()
+        self.fs.set_joinable()
         self.fs.wait_for_daemons()
 
         # See that the orphaned file is indeed missing from a client's POV
@@ -177,8 +176,7 @@ class TestForwardScrub(CephFSTestCase):
         self.assertTagged(inos['./parent/unflushed/jfile'], tag, self.fs.get_data_pool_name())
 
         # Run cephfs-data-scan targeting only orphans
-        self.fs.mds_stop()
-        self.fs.mds_fail()
+        self.fs.fail()
         self.fs.data_scan(["scan_extents", self.fs.get_data_pool_name()])
         self.fs.data_scan([
             "scan_inodes",
@@ -192,7 +190,7 @@ class TestForwardScrub(CephFSTestCase):
 
         # And we should have all the same linkage we started with,
         # and no lost+found, and no extra inodes!
-        self.fs.mds_restart()
+        self.fs.set_joinable()
         self.fs.wait_for_daemons()
         self.mount_a.mount_wait()
         self._validate_linkage(inos)
@@ -239,8 +237,7 @@ class TestForwardScrub(CephFSTestCase):
         self.assertEqual(out_json["return_code"], 0)
         self.assertEqual(self.fs.wait_until_scrub_complete(tag=out_json["scrub_tag"]), True)
 
-        self.mds_cluster.mds_stop()
-        self.mds_cluster.mds_fail()
+        self.fs.fail()
 
         # Truncate the journal (to ensure the inotable on disk
         # is all that will be in the InoTable in memory)
@@ -255,7 +252,7 @@ class TestForwardScrub(CephFSTestCase):
         for key, value in inotable_copy.items():
             self.fs.put_metadata_object_raw(key, value)
 
-        self.mds_cluster.mds_restart()
+        self.fs.set_joinable()
         self.fs.wait_for_daemons()
 
         with self.assert_cluster_log("inode table repaired"):
@@ -264,7 +261,7 @@ class TestForwardScrub(CephFSTestCase):
         self.assertEqual(out_json["return_code"], 0)
         self.assertEqual(self.fs.wait_until_scrub_complete(tag=out_json["scrub_tag"]), True)
 
-        self.mds_cluster.mds_stop()
+        self.fs.fail()
         table_text = self.fs.table_tool(["0", "show", "inode"])
         table = json.loads(table_text)
         self.assertGreater(
diff --git a/qa/tasks/cephfs/test_misc.py b/qa/tasks/cephfs/test_misc.py
index 6a295bbfdf1a..b8740136067a 100644
--- a/qa/tasks/cephfs/test_misc.py
+++ b/qa/tasks/cephfs/test_misc.py
@@ -70,8 +70,7 @@ class TestMisc(CephFSTestCase):
 
         data_pool_name = self.fs.get_data_pool_name()
 
-        self.fs.mds_stop()
-        self.fs.mds_fail()
+        self.fs.fail()
 
         self.fs.mon_manager.raw_cluster_cmd('fs', 'rm', self.fs.name,
                                             '--yes-i-really-mean-it')
@@ -109,10 +108,11 @@ class TestMisc(CephFSTestCase):
                                             self.fs.metadata_pool_name,
                                             data_pool_name,
                                             "--force")
 
+        self.fs.mon_manager.raw_cluster_cmd('fs', 'fail', self.fs.name)
+
         self.fs.mon_manager.raw_cluster_cmd('fs', 'rm', self.fs.name,
                                             '--yes-i-really-mean-it')
-
         self.fs.mon_manager.raw_cluster_cmd('osd', 'pool', 'delete',
                                             self.fs.metadata_pool_name,
                                             self.fs.metadata_pool_name,
diff --git a/qa/tasks/cephfs/test_recovery_pool.py b/qa/tasks/cephfs/test_recovery_pool.py
index 2838f67b0dba..7658f0ab32aa 100644
--- a/qa/tasks/cephfs/test_recovery_pool.py
+++ b/qa/tasks/cephfs/test_recovery_pool.py
@@ -121,8 +121,8 @@ class TestRecoveryPool(CephFSTestCase):
         self.recovery_fs.table_tool([recovery_fs + ":0", "reset", "inode"])
 
         # Stop the MDS
-        self.fs.mds_stop()
-        self.fs.mds_fail()
+        self.fs.mds_stop()  # otherwise MDS will join once the fs is reset
+        self.fs.fail()
 
         # After recovery, we need the MDS to not be strict about stats (in production these options
         # are off by default, but in QA we need to explicitly disable them)
@@ -146,7 +146,6 @@ class TestRecoveryPool(CephFSTestCase):
         # Normal reset should fail when no objects are present, we'll use --force instead
         self.fs.journal_tool(["journal", "reset"], 0)
 
-        self.fs.mds_stop()
         self.fs.data_scan(['scan_extents', '--alternate-pool',
                            recovery_pool, '--filesystem', self.fs.name,
                            self.fs.get_data_pool_name()])
@@ -174,6 +173,7 @@ class TestRecoveryPool(CephFSTestCase):
 
         # Start the MDS
         self.fs.mds_restart()
+        self.fs.set_joinable()
         self.recovery_fs.mds_restart()
         self.fs.wait_for_daemons()
         self.recovery_fs.wait_for_daemons()
diff --git a/qa/tasks/cephfs/test_scrub.py b/qa/tasks/cephfs/test_scrub.py
index 01e9dc8f6f3f..37f0f98f71d1 100644
--- a/qa/tasks/cephfs/test_scrub.py
+++ b/qa/tasks/cephfs/test_scrub.py
@@ -12,8 +12,9 @@ ValidationError = namedtuple("ValidationError", ["exception", "backtrace"])
 
 
 class Workload(CephFSTestCase):
-    def __init__(self, filesystem, mount):
+    def __init__(self, test, filesystem, mount):
         super().__init__()
+        self._test = test
         self._mount = mount
         self._filesystem = filesystem
         self._initial_state = None
@@ -94,13 +95,13 @@ class DupInodeWorkload(Workload):
         temp_bin_path = "/tmp/10000000000.00000000_omap.bin"
         self._mount.umount_wait()
         self._filesystem.mds_asok(["flush", "journal"])
-        self._filesystem.mds_stop()
+        self._filesystem.fail()
         self._filesystem.rados(["getomapval", "10000000000.00000000",
                                 "parentfile_head", temp_bin_path])
         self._filesystem.rados(["setomapval", "10000000000.00000000",
                                 "shadow_head"], stdin_file=temp_bin_path)
-        self._filesystem.set_ceph_conf('mds', 'mds hack allow loading invalid metadata', True)
-        self._filesystem.mds_restart()
+        self._test.config_set('mds', 'mds_hack_allow_loading_invalid_metadata', True)
+        self._filesystem.set_joinable()
        self._filesystem.wait_for_daemons()
 
     def validate(self):
@@ -170,10 +171,10 @@ class TestScrub(CephFSTestCase):
         self.assertEqual(self._get_damage_count(), 0)
 
     def test_scrub_backtrace_for_new_files(self):
-        self._scrub_new_files(BacktraceWorkload(self.fs, self.mount_a))
+        self._scrub_new_files(BacktraceWorkload(self, self.fs, self.mount_a))
 
     def test_scrub_backtrace(self):
-        self._scrub(BacktraceWorkload(self.fs, self.mount_a))
+        self._scrub(BacktraceWorkload(self, self.fs, self.mount_a))
 
     def test_scrub_dup_inode(self):
-        self._scrub(DupInodeWorkload(self.fs, self.mount_a))
+        self._scrub(DupInodeWorkload(self, self.fs, self.mount_a))
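
For readers who do not have the qa helpers in front of them, here is a minimal sketch (not part of the patch) of the idiom the commit moves the tests toward. It assumes the qa suite's CephFSTestCase fixture, where self.fs wraps the file system under test; the helper calls on self.fs (mds_stop, mds_fail_restart, rank_fail, fail, set_joinable, wait_for_daemons) are taken from the diff above, while the class and method names themselves are hypothetical.

from tasks.cephfs.cephfs_test_case import CephFSTestCase


class TestRestartIdiomSketch(CephFSTestCase):
    def _bounce_rank_old_style(self):
        # Pattern being removed: stop the MDS daemon, mark it failed and
        # restart it, then wait for the daemons to become active again.
        self.fs.mds_stop()
        self.fs.mds_fail_restart()
        self.fs.wait_for_daemons()

    def _bounce_rank_new_style(self):
        # Replacement for bouncing a single rank: fail the rank and let an
        # MDS (a standby or the respawned daemon) take it over.
        self.fs.rank_fail()
        self.fs.wait_for_daemons()

    def _offline_work_new_style(self):
        # Replacement when the whole file system must be offline (e.g. for
        # RADOS-level metadata surgery): fail the file system, do the
        # offline work, then allow MDS daemons to join it again.
        self.fs.fail()
        # ... offline repair, journal or omap manipulation here ...
        self.fs.set_joinable()
        self.fs.wait_for_daemons()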