From f66b52806a7107465886e9b4a6fae83f517fdca0 Mon Sep 17 00:00:00 2001
From: Xiubo Li
Date: Tue, 20 Oct 2020 01:26:33 -0400
Subject: [PATCH] qa/cephfs: add session_timeout option support

When the MDS is revoking the Fwbl caps, the clients need to flush
the dirty data back to the OSDs, but the flush may leave the OSDs
overloaded and slow, so it can take more than 60 seconds to finish.
The MDS daemons will then report WRN messages.

For the teuthology test cases, let's just increase the timeout
value to make them pass.

Fixes: https://tracker.ceph.com/issues/47565
Signed-off-by: Xiubo Li
(cherry picked from commit 0422673b6150df851a4ea1662637a77585cde52d)
---
 qa/cephfs/overrides/session_timeout.yaml              | 4 ++++
 .../fs/basic_workload/overrides/session_timeout.yaml  | 1 +
 qa/suites/fs/thrash/overrides/session_timeout.yaml    | 1 +
 qa/suites/fs/verify/overrides/session_timeout.yaml    | 1 +
 qa/tasks/ceph.py                                      | 7 +++++++
 qa/tasks/cephfs/filesystem.py                         | 8 ++++++++
 qa/tasks/vstart_runner.py                             | 1 +
 7 files changed, 23 insertions(+)
 create mode 100644 qa/cephfs/overrides/session_timeout.yaml
 create mode 120000 qa/suites/fs/basic_workload/overrides/session_timeout.yaml
 create mode 120000 qa/suites/fs/thrash/overrides/session_timeout.yaml
 create mode 120000 qa/suites/fs/verify/overrides/session_timeout.yaml

diff --git a/qa/cephfs/overrides/session_timeout.yaml b/qa/cephfs/overrides/session_timeout.yaml
new file mode 100644
index 0000000000000..a7a1633371f3b
--- /dev/null
+++ b/qa/cephfs/overrides/session_timeout.yaml
@@ -0,0 +1,4 @@
+overrides:
+  ceph:
+    cephfs:
+      session_timeout: 300
diff --git a/qa/suites/fs/basic_workload/overrides/session_timeout.yaml b/qa/suites/fs/basic_workload/overrides/session_timeout.yaml
new file mode 120000
index 0000000000000..fce0318c58936
--- /dev/null
+++ b/qa/suites/fs/basic_workload/overrides/session_timeout.yaml
@@ -0,0 +1 @@
+.qa/cephfs/overrides/session_timeout.yaml
\ No newline at end of file
diff --git a/qa/suites/fs/thrash/overrides/session_timeout.yaml b/qa/suites/fs/thrash/overrides/session_timeout.yaml
new file mode 120000
index 0000000000000..fce0318c58936
--- /dev/null
+++ b/qa/suites/fs/thrash/overrides/session_timeout.yaml
@@ -0,0 +1 @@
+.qa/cephfs/overrides/session_timeout.yaml
\ No newline at end of file
diff --git a/qa/suites/fs/verify/overrides/session_timeout.yaml b/qa/suites/fs/verify/overrides/session_timeout.yaml
new file mode 120000
index 0000000000000..fce0318c58936
--- /dev/null
+++ b/qa/suites/fs/verify/overrides/session_timeout.yaml
@@ -0,0 +1 @@
+.qa/cephfs/overrides/session_timeout.yaml
\ No newline at end of file
diff --git a/qa/tasks/ceph.py b/qa/tasks/ceph.py
index e9d9a4a3df0a8..31194e0339771 100644
--- a/qa/tasks/ceph.py
+++ b/qa/tasks/ceph.py
@@ -1687,6 +1687,13 @@ def task(ctx, config):
             cephfs:
               max_mds: 2
 
+    To change the mdsmap's default session_timeout (60 seconds), use::
+
+        tasks:
+        - ceph:
+            cephfs:
+              session_timeout: 300
+
     Note, this will cause the task to check the /scratch_devs file on each node
     for available devices.  If no such file is found, /dev/sdb will be used.
diff --git a/qa/tasks/cephfs/filesystem.py b/qa/tasks/cephfs/filesystem.py
index 91895a3ea7cb1..bf337f84fc9cb 100644
--- a/qa/tasks/cephfs/filesystem.py
+++ b/qa/tasks/cephfs/filesystem.py
@@ -548,6 +548,9 @@ class Filesystem(MDSCluster):
     def set_max_mds(self, max_mds):
         self.set_var("max_mds", "%d" % max_mds)
 
+    def set_session_timeout(self, timeout):
+        self.set_var("session_timeout", "%d" % timeout)
+
     def set_allow_standby_replay(self, yes):
         self.set_var("allow_standby_replay", yes)
 
@@ -615,6 +618,11 @@ class Filesystem(MDSCluster):
         if max_mds > 1:
             self.set_max_mds(max_mds)
 
+        # If absent will use the default value (60 seconds)
+        session_timeout = self.fs_config.get('session_timeout', 60)
+        if session_timeout != 60:
+            self.set_session_timeout(session_timeout)
+
         self.getinfo(refresh = True)
diff --git a/qa/tasks/vstart_runner.py b/qa/tasks/vstart_runner.py
index a7f976d8ea5e2..50f5cc48f2f3e 100644
--- a/qa/tasks/vstart_runner.py
+++ b/qa/tasks/vstart_runner.py
@@ -1119,6 +1119,7 @@ class LocalFilesystem(Filesystem, LocalMDSCluster):
         self.metadata_overlay = False
         self.data_pool_name = None
         self.data_pools = None
+        self.fs_config = None
 
         # Hack: cheeky inspection of ceph.conf to see what MDSs exist
         self.mds_ids = set()
-- 
2.39.5
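
For context: the new set_session_timeout() helper goes through the generic
set_var() path in qa/tasks/cephfs/filesystem.py, which issues a "ceph fs set"
mon command against the filesystem. A minimal usage sketch, illustrative only
and not part of this patch (the name "fs" below stands for an
already-constructed Filesystem instance from the qa framework)::

    # Roughly what the new teuthology plumbing ends up doing for a job that
    # configures "cephfs: session_timeout: 300" -- raise the mdsmap session
    # timeout from the 60-second default to 300 seconds.
    fs.set_session_timeout(300)   # wraps set_var("session_timeout", "300"),
                                  # i.e. "ceph fs set <fs_name> session_timeout 300"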