]> git-server-git.apps.pok.os.sepia.ceph.com Git - ceph.git/commitdiff
qa: add test cases to check client eviction if an OSD is laggy
author: Dhairya Parmar <dparmar@redhat.com>
Wed, 12 Apr 2023 10:52:49 +0000 (16:22 +0530)
committer: Dhairya Parmar <dparmar@redhat.com>
Mon, 23 Oct 2023 05:29:10 +0000 (10:59 +0530)
Signed-off-by: Dhairya Parmar <dparmar@redhat.com>
(cherry picked from commit 51cca9b9dc085589eadb9534d27130de0d2c7162)

qa/tasks/cephfs/test_client_recovery.py

index 1bd6884a9dcc4b62bf13db0e477dd611466c0bcd..5e54c2f8906389f0c1e50854fbc10a3da4d4995c 100644 (file)
@@ -4,6 +4,7 @@ Teuthology task for exercising CephFS client recovery
 """
 
 import logging
+import signal
 from textwrap import dedent
 import time
 import distutils.version as version
@@ -755,3 +756,109 @@ class TestClientRecovery(CephFSTestCase):
             self.assertEqual(len(self.fs.mds_tell(["session", "ls"])), 0)
         self.mount_a.umount_wait(force=True)
 
+
+class TestClientOnLaggyOSD(CephFSTestCase):
+    """
+    Check how the MDS treats an unresponsive client while an OSD is laggy,
+    with defer_client_eviction_on_laggy_osds both enabled and disabled.
+    """
+    CLIENTS_REQUIRED = 2
+
+    def make_osd_laggy(self, osd, sleep=120):
+        """
+        Pause the given OSD with SIGSTOP for `sleep` seconds and then resume
+        it with SIGCONT.
+        """
+        self.mds_cluster.mon_manager.signal_osd(osd, signal.SIGSTOP)
+        try:
+            time.sleep(sleep)
+        finally:
+            # always resume the OSD, even if the sleep is interrupted, so
+            # that a stopped daemon is never left behind
+            self.mds_cluster.mon_manager.signal_osd(osd, signal.SIGCONT)
+
+    def clear_laggy_params(self, osd):
+        """
+        Revive the given OSD while mon_osd_laggy_weight is temporarily set
+        to 1, then put the previous value of the option back.
+        """
+        default_laggy_weight = self.config_get('mon', 'mon_osd_laggy_weight')
+        self.config_set('mon', 'mon_osd_laggy_weight', 1)
+        try:
+            # NOTE(review): presumably a weight of 1 lets the revive wipe out
+            # the laggy history accumulated for this OSD - confirm
+            self.mds_cluster.mon_manager.revive_osd(osd)
+        finally:
+            # restore the option even if reviving the OSD fails
+            self.config_set('mon', 'mon_osd_laggy_weight',
+                            default_laggy_weight)
+
+    def get_a_random_osd(self):
+        """
+        Return a randomly chosen entry from the 'live' list reported by
+        get_osd_status().
+        """
+        osds = self.mds_cluster.mon_manager.get_osd_status()
+        return random.choice(osds['live'])
+
+    def _restore_client_and_osd(self, osd):
+        """
+        Common teardown for the tests in this class: clean up after the
+        killed mount_a, mount it again, run create_destroy() on it and
+        finally clear the laggy parameters of `osd`.
+        """
+        try:
+            self.mount_a.kill_cleanup()
+            self.mount_a.mount_wait()
+            self.mount_a.create_destroy()
+        finally:
+            # must run even if bringing the client back fails
+            self.clear_laggy_params(osd)
+
+    def test_client_eviction_if_config_is_set(self):
+        """
+        If any client gets unresponsive/its session gets idle due to
+        lagginess with any OSD and if config option
+        defer_client_eviction_on_laggy_osds is set true (default true) then
+        make sure clients are not evicted until OSD(s) return to normal.
+        """
+
+        self.fs.mds_asok(['config', 'set', 'mds_defer_session_stale', 'false'])
+        self.config_set('mds', 'defer_client_eviction_on_laggy_osds', 'true')
+        self.assertEqual(self.config_get(
+            'mds', 'defer_client_eviction_on_laggy_osds'), 'true')
+
+        # make an OSD laggy
+        osd = self.get_a_random_osd()
+        self.make_osd_laggy(osd)
+
+        try:
+            mount_a_gid = self.mount_a.get_global_id()
+
+            self.mount_a.kill()
+
+            # client session should be open, it gets stale
+            # only after session_timeout time.
+            self.assert_session_state(mount_a_gid, "open")
+
+            # makes session stale
+            time.sleep(self.fs.get_var("session_timeout") * 1.5)
+            self.assert_session_state(mount_a_gid, "stale")
+
+            # it takes time to have laggy clients entries in cluster log,
+            # so retry up to 6 times (each attempt is given a 55 second
+            # timeout) to see if it is visible, finally restart the client
+            max_tries = 6
+            for attempt in range(1, max_tries + 1):
+                try:
+                    with self.assert_cluster_log(
+                            "1 client(s) laggy due to laggy OSDs",
+                            timeout=55):
+                        # make sure clients weren't evicted
+                        self.assert_session_count(2)
+                    break
+                except AssertionError:
+                    # out of retries: let the failure propagate
+                    if attempt == max_tries:
+                        raise
+        finally:
+            self._restore_client_and_osd(osd)
+
+    def test_client_eviction_if_config_is_unset(self):
+        """
+        If an OSD is laggy but config option
+        defer_client_eviction_on_laggy_osds is unset then an unresponsive
+        client does get evicted.
+        """
+
+        self.fs.mds_asok(['config', 'set', 'mds_defer_session_stale', 'false'])
+        self.config_set('mds', 'defer_client_eviction_on_laggy_osds', 'false')
+        self.assertEqual(self.config_get(
+            'mds', 'defer_client_eviction_on_laggy_osds'), 'false')
+
+        # make an OSD laggy
+        osd = self.get_a_random_osd()
+        self.make_osd_laggy(osd)
+
+        try:
+            session_timeout = self.fs.get_var("session_timeout")
+            mount_a_gid = self.mount_a.get_global_id()
+
+            # give mount_a's session a timeout of twice the filesystem's
+            # session_timeout
+            self.fs.mds_asok(['session', 'config', str(mount_a_gid),
+                              'timeout', str(session_timeout * 2)])
+
+            self.mount_a.kill()
+
+            self.assert_session_count(2)
+
+            # 1.5 * session_timeout is still below the per-session timeout
+            # set above, so the session is expected to be open
+            time.sleep(session_timeout * 1.5)
+            self.assert_session_state(mount_a_gid, "open")
+
+            # after one more session_timeout the per-session timeout has
+            # passed and only a single session is expected to remain
+            time.sleep(session_timeout)
+            self.assert_session_count(1)
+        finally:
+            self._restore_client_and_osd(osd)