self.recovery_fs.set_data_pool_name(self.fs.get_data_pool_name())
self.recovery_fs.create()
self.recovery_fs.getinfo(refresh=True)
- self.recovery_fs.mds_restart()
self.recovery_fs.wait_for_daemons()
# Load any config settings of interest
# Restart the MDS to drop the metadata cache (because we expired the journal,
# nothing gets replayed into cache on restart)
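# (fs.rank_fail() has the mons fail the MDS rank, so the daemon respawns and a
# standby, often the same daemon, takes the rank back; it stands in for the
# explicit mds_stop()/mds_fail_restart() pair)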
- self.fs.mds_stop()
- self.fs.mds_fail_restart()
+ self.fs.rank_fail()
self.fs.wait_for_daemons()
# remove testdir1's backtrace
time.sleep(10)
# Restart the MDS. The client will re-send the unsafe request and cap flush
- self.fs.mds_stop()
- self.fs.mds_fail_restart()
+ self.fs.rank_fail()
self.fs.wait_for_daemons()
mode = self.mount_a.run_shell(['stat', '-c', '%a', file_path]).stdout.getvalue().strip()
# =================
# Check that if I stop an MDS and a client goes away, the MDS waits
# for the reconnect period
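# (fs.fail() issues `ceph fs fail`, which marks the filesystem not joinable and
# fails every rank in one step; fs.set_joinable() flips the flag back so standby
# daemons can take the ranks again, replacing the old stop/fail/restart calls)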
- self.fs.mds_stop()
- self.fs.mds_fail()
+ self.fs.fail()
mount_a_client_id = self.mount_a.get_global_id()
self.mount_a.umount_wait(force=True)
- self.fs.mds_restart()
+ self.fs.set_joinable()
self.fs.wait_for_state('up:reconnect', reject='up:active', timeout=MDS_RESTART_GRACE)
# Check that the MDS locally reports its state correctly
# =========================
mount_a_client_id = self.mount_a.get_global_id()
- self.fs.mds_stop()
- self.fs.mds_fail()
+ self.fs.fail()
# The mount goes away while the MDS is offline
self.mount_a.kill()
# wait for it to die
time.sleep(5)
- self.fs.mds_restart()
+ self.fs.set_joinable()
# Enter reconnect phase
self.fs.wait_for_state('up:reconnect', reject='up:active', timeout=MDS_RESTART_GRACE)
)
# Immediately kill the MDS and then client A
- self.fs.mds_stop()
- self.fs.mds_fail()
+ self.fs.fail()
self.mount_a.kill()
self.mount_a.kill_cleanup()
# Restart the MDS. Wait for it to come up; it'll have to time out in clientreplay
- self.fs.mds_restart()
+ self.fs.set_joinable()
log.info("Waiting for reconnect...")
self.fs.wait_for_state("up:reconnect")
log.info("Waiting for active...")
for mds_name in self.fs.get_active_names():
self.fs.mds_asok(["flush", "journal"], mds_name)
- self.fs.mds_stop()
- self.fs.mds_fail()
+ self.fs.fail()
self.fs.rados(['export', '/tmp/metadata.bin'])
# Reset MDS state
self.mount_a.umount_wait(force=True)
- self.fs.mds_stop()
- self.fs.mds_fail()
+ self.fs.fail()
self.fs.mon_manager.raw_cluster_cmd('mds', 'repaired', '0')
# Reset RADOS pool state
mutation.mutate_fn()
# Try starting the MDS
- self.fs.mds_restart()
+ self.fs.set_joinable()
# How long we'll wait between starting a daemon and expecting
# it to make it through startup, and potentially declare itself damaged
for mds_name in self.fs.get_active_names():
self.fs.mds_asok(["flush", "journal"], mds_name)
- self.fs.mds_stop()
- self.fs.mds_fail()
+ self.fs.fail()
# Corrupt a dentry
junk = "deadbeef" * 10
self.fs.rados(["setomapval", dirfrag_obj, "file_to_be_damaged_head", junk])
# Start up and try to list it
- self.fs.mds_restart()
+ self.fs.set_joinable()
self.fs.wait_for_daemons()
self.mount_a.mount_wait()
self.fs.mds_asok(["flush", "journal"])
# Drop everything from the MDS cache
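# (failing the whole filesystem, rather than stopping a single daemon, guarantees
# that no rank is live while journal_tool rewrites the journal below)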
- self.mds_cluster.mds_stop()
+ self.fs.fail()
self.fs.journal_tool(['journal', 'reset'], 0)
- self.mds_cluster.mds_fail_restart()
+ self.fs.set_joinable()
self.fs.wait_for_daemons()
self.mount_a.mount_wait()
workload.flush()
# Stop the MDS
- self.fs.mds_stop()
- self.fs.mds_fail()
+ self.fs.fail()
# After recovery, we need the MDS to not be strict about stats (in production these options
# are off by default, but in QA we need to explicitly disable them)
# only understands how to rebuild metadata under rank 0
self.fs.reset()
- self.fs.mds_restart()
+ self.fs.set_joinable()
def get_state(mds_id):
info = self.mds_cluster.get_mds_info(mds_id)
# Flush journal and stop MDS
self.mount_a.umount_wait()
self.fs.mds_asok(["flush", "journal"], mds_id)
- self.fs.mds_stop()
- self.fs.mds_fail()
+ self.fs.fail()
# Pick a dentry and wipe out its key
# Because I did a 1 bit split, I know one frag will be named <inode>.01000000
self.fs.rados(["rmomapkey", frag_obj_id, victim_key])
# Start filesystem back up, observe that the file appears to be gone in an `ls`
- self.fs.mds_restart()
+ self.fs.set_joinable()
self.fs.wait_for_daemons()
self.mount_a.mount_wait()
files = self.mount_a.run_shell(["ls", "subdir/"]).stdout.getvalue().strip().split("\n")
# Stop the filesystem
self.mount_a.umount_wait()
- self.fs.mds_stop()
- self.fs.mds_fail()
+ self.fs.fail()
# Run data-scan, observe that it inserts our dentry back into the correct fragment
# by checking the omap now has the dentry's key again
# Start the filesystem and check that the dentry we deleted is now once again visible
# and points to the correct file data.
- self.fs.mds_restart()
+ self.fs.set_joinable()
self.fs.wait_for_daemons()
self.mount_a.mount_wait()
out = self.mount_a.run_shell(["cat", "subdir/{0}".format(victim_dentry)]).stdout.getvalue().strip()
self.mount_a.umount_wait()
self.fs.mds_asok(["flush", "journal"], mds_id)
- self.fs.mds_stop()
- self.fs.mds_fail()
+ self.fs.fail()
# repair linkage errors
self.fs.data_scan(["scan_links"])
# primary link in testdir2 was deleted?
self.assertNotIn(file1_key, self._dirfrag_keys(dirfrag2_oid))
- self.fs.mds_restart()
+ self.fs.set_joinable()
self.fs.wait_for_daemons()
self.mount_a.mount_wait()
self.fs.mds_asok(["flush", "journal"], mds0_id)
self.fs.mds_asok(["flush", "journal"], mds1_id)
- self.mds_cluster.mds_stop()
+ self.fs.fail()
self.fs.rados(["rm", "mds0_inotable"])
self.fs.rados(["rm", "mds1_inotable"])
# Orphan an inode by deleting its dentry
# Our victim will be.... bravo.
self.mount_a.umount_wait()
- self.fs.mds_stop()
- self.fs.mds_fail()
+ self.fs.fail()
self.fs.set_ceph_conf('mds', 'mds verify scatter', False)
self.fs.set_ceph_conf('mds', 'mds debug scatterstat', False)
frag_obj_id = "{0:x}.00000000".format(inos["./parent/flushed"])
self.fs.rados(["rmomapkey", frag_obj_id, "bravo_head"])
- self.fs.mds_restart()
+ self.fs.set_joinable()
self.fs.wait_for_daemons()
# See that the orphaned file is indeed missing from a client's POV
self.assertTagged(inos['./parent/unflushed/jfile'], tag, self.fs.get_data_pool_name())
# Run cephfs-data-scan targeting only orphans
- self.fs.mds_stop()
- self.fs.mds_fail()
+ self.fs.fail()
self.fs.data_scan(["scan_extents", self.fs.get_data_pool_name()])
self.fs.data_scan([
"scan_inodes",
# And we should have all the same linkage we started with,
# and no lost+found, and no extra inodes!
- self.fs.mds_restart()
+ self.fs.set_joinable()
self.fs.wait_for_daemons()
self.mount_a.mount_wait()
self._validate_linkage(inos)
self.assertEqual(out_json["return_code"], 0)
self.assertEqual(self.fs.wait_until_scrub_complete(tag=out_json["scrub_tag"]), True)
- self.mds_cluster.mds_stop()
- self.mds_cluster.mds_fail()
+ self.fs.fail()
# Truncate the journal (to ensure the inotable on disk
# is all that will be in the InoTable in memory)
for key, value in inotable_copy.items():
self.fs.put_metadata_object_raw(key, value)
- self.mds_cluster.mds_restart()
+ self.fs.set_joinable()
self.fs.wait_for_daemons()
with self.assert_cluster_log("inode table repaired"):
self.assertEqual(out_json["return_code"], 0)
self.assertEqual(self.fs.wait_until_scrub_complete(tag=out_json["scrub_tag"]), True)
- self.mds_cluster.mds_stop()
+ self.fs.fail()
table_text = self.fs.table_tool(["0", "show", "inode"])
table = json.loads(table_text)
self.assertGreater(
data_pool_name = self.fs.get_data_pool_name()
- self.fs.mds_stop()
- self.fs.mds_fail()
+ self.fs.fail()
self.fs.mon_manager.raw_cluster_cmd('fs', 'rm', self.fs.name,
'--yes-i-really-mean-it')
self.fs.metadata_pool_name,
data_pool_name, "--force")
+ self.fs.mon_manager.raw_cluster_cmd('fs', 'fail', self.fs.name)
+
self.fs.mon_manager.raw_cluster_cmd('fs', 'rm', self.fs.name,
'--yes-i-really-mean-it')
-
self.fs.mon_manager.raw_cluster_cmd('osd', 'pool', 'delete',
self.fs.metadata_pool_name,
self.fs.metadata_pool_name,
self.recovery_fs.table_tool([recovery_fs + ":0", "reset", "inode"])
# Stop the MDS
- self.fs.mds_stop()
- self.fs.mds_fail()
+ self.fs.mds_stop() # otherwise MDS will join once the fs is reset
+ self.fs.fail()
# After recovery, we need the MDS to not be strict about stats (in production these options
# are off by default, but in QA we need to explicitly disable them)
# Normal reset should fail when no objects are present, we'll use --force instead
self.fs.journal_tool(["journal", "reset"], 0)
- self.fs.mds_stop()
self.fs.data_scan(['scan_extents', '--alternate-pool',
recovery_pool, '--filesystem', self.fs.name,
self.fs.get_data_pool_name()])
# Start the MDS
self.fs.mds_restart()
+ self.fs.set_joinable()
self.recovery_fs.mds_restart()
self.fs.wait_for_daemons()
self.recovery_fs.wait_for_daemons()
class Workload(CephFSTestCase):
- def __init__(self, filesystem, mount):
+ def __init__(self, test, filesystem, mount):
super().__init__()
+ self._test = test
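# (the test case reference lets workloads call test helpers such as
# config_set(), which the damage step below relies on)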
self._mount = mount
self._filesystem = filesystem
self._initial_state = None
temp_bin_path = "/tmp/10000000000.00000000_omap.bin"
self._mount.umount_wait()
self._filesystem.mds_asok(["flush", "journal"])
- self._filesystem.mds_stop()
+ self._filesystem.fail()
self._filesystem.rados(["getomapval", "10000000000.00000000",
"parentfile_head", temp_bin_path])
self._filesystem.rados(["setomapval", "10000000000.00000000",
"shadow_head"], stdin_file=temp_bin_path)
- self._filesystem.set_ceph_conf('mds', 'mds hack allow loading invalid metadata', True)
- self._filesystem.mds_restart()
+ self._test.config_set('mds', 'mds_hack_allow_loading_invalid_metadata', True)
+ self._filesystem.set_joinable()
self._filesystem.wait_for_daemons()
def validate(self):
self.assertEqual(self._get_damage_count(), 0)
def test_scrub_backtrace_for_new_files(self):
- self._scrub_new_files(BacktraceWorkload(self.fs, self.mount_a))
+ self._scrub_new_files(BacktraceWorkload(self, self.fs, self.mount_a))
def test_scrub_backtrace(self):
- self._scrub(BacktraceWorkload(self.fs, self.mount_a))
+ self._scrub(BacktraceWorkload(self, self.fs, self.mount_a))
def test_scrub_dup_inode(self):
- self._scrub(DupInodeWorkload(self.fs, self.mount_a))
+ self._scrub(DupInodeWorkload(self, self.fs, self.mount_a))