]> git.apps.os.sepia.ceph.com Git - ceph.git/commitdiff
test: add test to verify that a buggy client is blocklisted
author Venky Shankar <vshankar@redhat.com>
Fri, 11 Aug 2023 08:40:36 +0000 (04:40 -0400)
committer Venky Shankar <vshankar@redhat.com>
Tue, 22 Aug 2023 04:47:27 +0000 (00:47 -0400)
... when its session metadata is bloated due to buildup of
`completed_requests`.

Signed-off-by: Venky Shankar <vshankar@redhat.com>
qa/tasks/cephfs/test_client_limits.py

index 836f81af1642aada64271d26d6231007d3dc7f47..102f22f960cc4d2835c66c488d83a9567ae0b194 100644 (file)
@@ -9,6 +9,7 @@ from textwrap import dedent
 from tasks.ceph_test_case import TestTimeoutError
 from tasks.cephfs.cephfs_test_case import CephFSTestCase, needs_trimming
 from tasks.cephfs.fuse_mount import FuseMount
+from teuthology.exceptions import CommandFailedError
 import os
 
 
@@ -221,6 +222,56 @@ class TestClientLimits(CephFSTestCase):
         self.fs.mds_asok(['session', 'evict', "%s" % mount_a_client_id])
         rproc.wait()
 
+    def test_client_blocklisted_oldest_tid(self):
+        """
+        Test that a client is blocklisted when its encoded session metadata exceeds
+        the configured threshold (due to ever growing `completed_requests` caused by
+        an unidentified bug in the client or the MDS).
+        """
+        # num of requests client issues
+        max_requests = 10000
+
+        # The debug hook to inject the failure only exists in the fuse client
+        if not isinstance(self.mount_a, FuseMount):
+            self.skipTest("Require FUSE client to inject fixed oldest tid")
+
+        self.config_set('client', 'client inject fixed oldest tid', 'true')
+        self.mount_a.teardown()
+        self.mount_a.mount_wait()
+
+        self.config_set('mds', 'mds_max_completed_requests', max_requests)
+
+        # Create lots of files
+        self.mount_a.create_n_files("testdir/file1", max_requests + 100)
+
+        # Create a few files synchronously. This makes sure previous requests are completed
+        self.mount_a.create_n_files("testdir/file2", 5, True)
+
+        # Wait for the health warnings. Assume mds can handle 10 request per second at least
+        self.wait_for_health("MDS_CLIENT_OLDEST_TID", max_requests // 10, check_in_detail=str(self.mount_a.client_id))
+
+        # Why a multiplier of 20, you may ask - I arrived at this from some debug
+        # output added when testing the fix in a vstart cluster, where it is the ratio
+        # of encoded session information to the number of completed requests.
+        self.config_set('mds', 'mds_session_metadata_threshold', max_requests * 20)
+
+        # Create a few more files synchronously. This would hit the session metadata threshold
+        # causing the client to get blocklisted.
+        with self.assertRaises(CommandFailedError):
+            self.mount_a.create_n_files("testdir/file2", 20, True)
+
+        # assert the lookup result; a bare call would never fail the test
+        self.assertTrue(self.mds_cluster.is_addr_blocklisted(self.mount_a.get_global_addr()))
+        # the mds should bump up the relevant perf counter
+        pd = self.perf_dump()
+        self.assertGreater(pd['mds_sessions']['md_thresh_evicted'], 0)
+
+        # reset the config
+        self.config_set('client', 'client inject fixed oldest tid', 'false')
+
+        self.mount_a.kill_cleanup()
+        self.mount_a.mount_wait()
+
     def test_client_oldest_tid(self):
         """
         When a client does not advance its oldest tid, the MDS should notice that