From 2142114a2da5ac329ab9608405231ba1aa870206 Mon Sep 17 00:00:00 2001
From: Patrick Donnelly
Date: Mon, 30 Jan 2023 14:33:32 -0500
Subject: [PATCH] qa: add numerous subtree test

When the ESubtreeMap is very large (~5k+ subtrees), the MDS will end up
logging only a few events (as bad as 1) per segment, as the subtree map
dominates the segment size. This test simply creates an artificially
large number of subtrees and confirms that other file system activity
completes in a timely manner. This now takes advantage of minor
segments, which allow for a normal number of events per log segment
(and fewer subtree maps). The test fails on the current main HEAD.

Historical note: when I first observed this aberrant behavior, the
vstart cluster was actually using mds_debug_subtrees = True (the
default for every vstart cluster). This caused the MDS to write out the
subtree map (for debugging reasons) with every event. When testing the
MDS with large subtrees (distributed ephemeral pinning), this caused
the MDS to slow to a trickle of operations per second. Despite this
unintentional misconfiguration, the problem still exists, but the
number of auth subtrees on a particular rank must be large to replicate
the behavior.

On main HEAD, the creation of 10k files (workload stage) takes ~110
seconds. On this branch, it takes ~30 seconds.

Signed-off-by: Patrick Donnelly
---
 qa/tasks/cephfs/test_exports.py | 47 +++++++++++++++++++++++++++++++++
 1 file changed, 47 insertions(+)

diff --git a/qa/tasks/cephfs/test_exports.py b/qa/tasks/cephfs/test_exports.py
index 4b7e884ec33..2074d3da790 100644
--- a/qa/tasks/cephfs/test_exports.py
+++ b/qa/tasks/cephfs/test_exports.py
@@ -141,6 +141,53 @@ class TestExportPin(CephFSTestCase):
         self.assertEqual(self.mount_a.getfattr("1", "ceph.dir.pin"), '1')
         self.assertEqual(self.mount_a.getfattr("1/2", "ceph.dir.pin"), '0')
 
+    def test_export_pin_many(self):
+        """
+        That large numbers of export pins don't slow down the MDS in unexpected ways.
+        """
+
+        def getlrg():
+            return self.fs.rank_asok(['perf', 'dump', 'mds_log'])['mds_log']['evlrg']
+
+        # vstart.sh sets mds_debug_subtrees to True. That causes an ESubtreeMap
+        # to be written out with every event. Yuck!
+        self.config_set('mds', 'mds_debug_subtrees', False)
+        self.mount_a.run_shell_payload("rm -rf 1")
+
+        # flush everything out so ESubtreeMap is the only event in the log
+        self.fs.rank_asok(["flush", "journal"], rank=0)
+        lrg = getlrg()
+
+        n = 5000
+        self.mount_a.run_shell_payload(f"""
+mkdir top
+setfattr -n ceph.dir.pin -v 1 top
+for i in `seq 0 {n-1}`; do
+  path=$(printf top/%08d $i)
+  mkdir "$path"
+  touch "$path/file"
+  setfattr -n ceph.dir.pin -v 0 "$path"
+done
+""")
+
+        subtrees = []
+        subtrees.append(('/top', 1))
+        for i in range(0, n):
+            subtrees.append((f"/top/{i:08}", 0))
+        self._wait_subtrees(subtrees, status=self.status, timeout=300, rank=1)
+
+        self.assertGreater(getlrg(), lrg)
+
+        # flush everything out so ESubtreeMap is the only event in the log
+        self.fs.rank_asok(["flush", "journal"], rank=0)
+
+        # now do some trivial work on rank 0; verify journaling is not slowed down by thousands of subtrees
+        start = time.time()
+        lrg = getlrg()
+        self.mount_a.run_shell_payload('cd top/00000000 && for i in `seq 1 10000`; do mkdir $i; done;')
+        self.assertLessEqual(getlrg()-1, lrg) # at most one new ESubtreeMap separating events
+        self.assertLess(time.time()-start, 120)
+
     def test_export_pin_cache_drop(self):
         """
         That the export pin does not prevent empty (nothing in cache) subtree merging.
-- 
2.39.5
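
For anyone reproducing the timing numbers by hand outside teuthology: the
evlrg counter sampled by getlrg() (per the final assertion's comment, it
counts journaled ESubtreeMap events) is ordinary perf-counter data. Below is
a minimal sketch of reading it directly, assuming a vstart-style cluster with
an MDS daemon named "a" and a shell on the host holding its admin socket; the
daemon name and the subprocess plumbing are illustrative, not part of the
patch.

    #!/usr/bin/env python3
    # Hand-check for the evlrg counter the test samples. Assumptions (not
    # from the patch): a vstart-style cluster, an MDS daemon named "a", and
    # that this runs on the host holding that daemon's admin socket.
    import json
    import subprocess

    def get_evlrg(daemon="mds.a"):
        # Same query as the test's rank_asok(['perf', 'dump', 'mds_log']),
        # issued through the admin socket with the plain `ceph daemon` CLI.
        out = subprocess.check_output(
            ["ceph", "daemon", daemon, "perf", "dump", "mds_log"])
        return json.loads(out)["mds_log"]["evlrg"]

    if __name__ == "__main__":
        before = get_evlrg()
        input("run the workload, then press enter... ")
        print(f"evlrg delta: {get_evlrg() - before}")

Bracketing the 10k-mkdir stage with two samples mirrors what the test does:
on a fixed branch the delta should be at most 1, which is what the
assertLessEqual(getlrg()-1, lrg) check encodes.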