git.apps.os.sepia.ceph.com Git - ceph.git/commitdiff
test: validate client eviction for cap revoke non-responders
author Venky Shankar <vshankar@redhat.com>
Mon, 6 Aug 2018 03:37:41 +0000 (23:37 -0400)
committer Venky Shankar <vshankar@redhat.com>
Thu, 18 Oct 2018 12:36:14 +0000 (08:36 -0400)
Signed-off-by: Venky Shankar <vshankar@redhat.com>
(cherry picked from commit c0b1dacc9f9c9b5af07a1b83a0adb53d001c2b79)
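
The test drives the new mds_cap_revoke_eviction_timeout option through the MDS
admin socket via self.fs.mds_asok(['config', 'set', ...]). As a rough sketch
(not part of this commit), the same knob can be set on a live MDS outside
teuthology through the `ceph daemon` wrapper around the admin socket; the
mds_name argument below is a placeholder for the local MDS id:

    import subprocess

    def set_cap_revoke_eviction_timeout(mds_name, seconds):
        # Rough equivalent of self.fs.mds_asok(['config', 'set', ...]):
        # `ceph daemon` talks to the daemon's admin socket on the local node.
        subprocess.check_call([
            'ceph', 'daemon', 'mds.{0}'.format(mds_name),
            'config', 'set', 'mds_cap_revoke_eviction_timeout', str(seconds),
        ])

The test picks an eviction timeout of half the session timeout so that the
cap-revoke auto-eviction fires well before the session would go stale.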

qa/tasks/cephfs/test_misc.py

index 6757b009b736e65a3ed1a66b232705c865fabd42..246d1db4e490f9f5d8ad473b2467abae552b90be 100644 (file)
@@ -2,11 +2,13 @@
 from unittest import SkipTest
 from tasks.cephfs.fuse_mount import FuseMount
 from tasks.cephfs.cephfs_test_case import CephFSTestCase
-from teuthology.orchestra.run import CommandFailedError
+from teuthology.orchestra.run import CommandFailedError, ConnectionLostError
 import errno
 import time
 import json
+import logging
 
+log = logging.getLogger(__name__)
 
 class TestMisc(CephFSTestCase):
     CLIENTS_REQUIRED = 2
@@ -129,6 +131,60 @@ class TestMisc(CephFSTestCase):
         ls_data = self.fs.mds_asok(['session', 'ls'])
         self.assert_session_count(1, ls_data)
 
+    def test_cap_revoke_nonresponder(self):
+        """
+        Check that a client is evicted if it has not responded to a cap revoke
+        request for the configured number of seconds.
+        """
+        session_timeout = self.fs.get_var("session_timeout")
+        eviction_timeout = session_timeout / 2.0
+
+        self.fs.mds_asok(['config', 'set', 'mds_cap_revoke_eviction_timeout',
+                          str(eviction_timeout)])
+
+        cap_holder = self.mount_a.open_background()
+
+        # Wait for the file to be visible from another client, indicating
+        # that mount_a has completed its network ops
+        self.mount_b.wait_for_visible()
+
+        # Simulate client death
+        self.mount_a.kill()
+
+        try:
+            # The waiter should get stuck waiting for the capability
+            # held on the MDS by the now-dead client A
+            cap_waiter = self.mount_b.write_background()
+
+            a = time.time()
+            time.sleep(eviction_timeout)
+            cap_waiter.wait()
+            b = time.time()
+            cap_waited = b - a
+            log.info("cap_waiter waited {0}s".format(cap_waited))
+
+            # check that the cap was transferred before the session timeout
+            # kicked in. this is a good enough check to ensure that the client
+            # got evicted by the cap-revoke auto-evicter rather than by
+            # transitioning to the stale state and then getting evicted.
+            self.assertLess(cap_waited, session_timeout,
+                            "Capability handover took {0}, expected less than {1}".format(
+                                cap_waited, session_timeout
+                            ))
+
+            self.assertTrue(self.mount_a.is_blacklisted())
+            cap_holder.stdin.close()
+            try:
+                cap_holder.wait()
+            except (CommandFailedError, ConnectionLostError):
+                # We killed it (and possibly its node), so it raises an error
+                pass
+        finally:
+            self.mount_a.kill_cleanup()
+
+        self.mount_a.mount()
+        self.mount_a.wait_until_mounted()
+
     def test_filtered_df(self):
         pool_name = self.fs.get_data_pool_name()
         raw_df = self.fs.get_pool_df(pool_name)