From 84df4b3d0c9e767a74cf5af80e8138239992df2c Mon Sep 17 00:00:00 2001 From: Venky Shankar Date: Fri, 11 Aug 2023 04:40:36 -0400 Subject: [PATCH] test: add test to verify that a buggy client is blocklisted ... when its session metadata is bloated due to buildup of `completed_requests`. Signed-off-by: Venky Shankar --- qa/tasks/cephfs/test_client_limits.py | 51 +++++++++++++++++++++++++++ 1 file changed, 51 insertions(+) diff --git a/qa/tasks/cephfs/test_client_limits.py b/qa/tasks/cephfs/test_client_limits.py index 836f81af1642a..102f22f960cc4 100644 --- a/qa/tasks/cephfs/test_client_limits.py +++ b/qa/tasks/cephfs/test_client_limits.py @@ -9,6 +9,7 @@ from textwrap import dedent from tasks.ceph_test_case import TestTimeoutError from tasks.cephfs.cephfs_test_case import CephFSTestCase, needs_trimming from tasks.cephfs.fuse_mount import FuseMount +from teuthology.exceptions import CommandFailedError import os @@ -221,6 +222,56 @@ class TestClientLimits(CephFSTestCase): self.fs.mds_asok(['session', 'evict', "%s" % mount_a_client_id]) rproc.wait() + def test_client_blocklisted_oldest_tid(self): + """ + that a client is blocklisted when its encoded session metadata exceeds the + configured threshold (due to ever growing `completed_requests` caused due + to an unidentified bug (in the client or the MDS)). + """ + + # num of requests client issues + max_requests = 10000 + + # The debug hook to inject the failure only exists in the fuse client + if not isinstance(self.mount_a, FuseMount): + self.skipTest("Require FUSE client to inject client release failure") + + self.config_set('client', 'client inject fixed oldest tid', 'true') + self.mount_a.teardown() + self.mount_a.mount_wait() + + self.config_set('mds', 'mds_max_completed_requests', max_requests); + + # Create lots of files + self.mount_a.create_n_files("testdir/file1", max_requests + 100) + + # Create a few files synchronously. This makes sure previous requests are completed + self.mount_a.create_n_files("testdir/file2", 5, True) + + # Wait for the health warnings. Assume mds can handle 10 request per second at least + self.wait_for_health("MDS_CLIENT_OLDEST_TID", max_requests // 10, check_in_detail=str(self.mount_a.client_id)) + + # why a multiplier of 20, you may ask - I arrieved at this from some debugs + # that I put when testing the fix in a vstart cluster where its a ratio of + # encoded session information to the number of completed requests. + self.config_set('mds', 'mds_session_metadata_threshold', max_requests*20); + + # Create a few more files synchronously. This would hit the session metadata threshold + # causing the client to get blocklisted. + with self.assertRaises(CommandFailedError): + self.mount_a.create_n_files("testdir/file2", 20, True) + + self.mds_cluster.is_addr_blocklisted(self.mount_a.get_global_addr()) + # the mds should bump up the relevant perf counter + pd = self.perf_dump() + self.assertGreater(pd['mds_sessions']['md_thresh_evicted'], 0) + + # reset the config + self.config_set('client', 'client inject fixed oldest tid', 'false') + + self.mount_a.kill_cleanup() + self.mount_a.mount_wait() + def test_client_oldest_tid(self): """ When a client does not advance its oldest tid, the MDS should notice that -- 2.39.5