From ef46216d8d0b659549925481b4eff6bd7d2c43c9 Mon Sep 17 00:00:00 2001
From: Patrick Donnelly
Date: Wed, 23 Jan 2019 06:41:55 -0800
Subject: [PATCH] mds: recall caps incrementally

As with trimming, use DecayCounters to throttle the number of caps we
recall, both globally and per-session.

Signed-off-by: Patrick Donnelly
---
 PendingReleaseNotes                                |   4 +
 .../client_trim_caps/tasks/trim-i22073.yaml        |   1 -
 qa/tasks/cephfs/test_client_limits.py              |   4 +-
 src/common/options.cc                              |  20 +++-
 src/mds/Beacon.cc                                  |  38 +++---
 src/mds/MDCache.cc                                 |   3 +-
 src/mds/MDCache.h                                  |   2 -
 src/mds/MDSDaemon.cc                               |   4 +-
 src/mds/MDSRank.cc                                 |  13 +-
 src/mds/Server.cc                                  | 113 ++++++++++++------
 src/mds/Server.h                                   |  16 ++-
 src/mds/SessionMap.cc                              |  87 +++++++++-----
 src/mds/SessionMap.h                               |  40 +++++--
 13 files changed, 232 insertions(+), 113 deletions(-)

diff --git a/PendingReleaseNotes b/PendingReleaseNotes
index 61424bc62f4..c9b869a3602 100644
--- a/PendingReleaseNotes
+++ b/PendingReleaseNotes
@@ -169,6 +169,10 @@
   via the `ceph tell mds. cache drop` command or large reductions in the cache size
   will no longer cause service unavailability.

+* The CephFS MDS behavior with recalling caps has been significantly improved
+  to not attempt recalling too many caps at once, leading to instability.
+  MDS with a large cache (64GB+) should be more stable.
+
 >=13.1.0
 --------

diff --git a/qa/suites/fs/bugs/client_trim_caps/tasks/trim-i22073.yaml b/qa/suites/fs/bugs/client_trim_caps/tasks/trim-i22073.yaml
index 410606225f0..f0ed3366c75 100644
--- a/qa/suites/fs/bugs/client_trim_caps/tasks/trim-i22073.yaml
+++ b/qa/suites/fs/bugs/client_trim_caps/tasks/trim-i22073.yaml
@@ -10,7 +10,6 @@ overrides:
 tasks:
 - exec:
     mon.a:
-    - "ceph tell mds.* config set mds_max_ratio_caps_per_client 1"
     - "ceph tell mds.* config set mds_min_caps_per_client 1"
 - background_exec:
     mon.a:
diff --git a/qa/tasks/cephfs/test_client_limits.py b/qa/tasks/cephfs/test_client_limits.py
index 1f1d5467079..322bd8c895e 100644
--- a/qa/tasks/cephfs/test_client_limits.py
+++ b/qa/tasks/cephfs/test_client_limits.py
@@ -47,7 +47,6 @@ class TestClientLimits(CephFSTestCase):

         mds_min_caps_per_client = int(self.fs.get_config("mds_min_caps_per_client"))
         self.assertTrue(open_files >= mds_min_caps_per_client)
-        mds_max_ratio_caps_per_client = float(self.fs.get_config("mds_max_ratio_caps_per_client"))

         mount_a_client_id = self.mount_a.get_global_id()
         path = "subdir/mount_a" if use_subdir else "mount_a"
@@ -84,14 +83,13 @@ class TestClientLimits(CephFSTestCase):

         # The remaining caps should comply with the numbers sent from MDS in SESSION_RECALL message,
         # which depend on the caps outstanding, cache size and overall ratio
-        recall_expected_value = int((1.0-mds_max_ratio_caps_per_client)*(open_files+2))
         def expected_caps():
             num_caps = self.get_session(mount_a_client_id)['num_caps']
             if num_caps < mds_min_caps_per_client:
                 raise RuntimeError("client caps fell below min!")
             elif num_caps == mds_min_caps_per_client:
                 return True
-            elif recall_expected_value*.95 <= num_caps <= recall_expected_value*1.05:
+            elif num_caps < cache_size:
                 return True
             else:
                 return False
diff --git a/src/common/options.cc b/src/common/options.cc
index 8d03bbc5c25..090e126fa78 100644
--- a/src/common/options.cc
+++ b/src/common/options.cc
@@ -7231,6 +7231,22 @@ std::vector
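
A minimal sketch of the throttling scheme the commit message describes, for orientation only; it is not code from this patch. SimpleDecayCounter, caps_to_recall, and every threshold and half-life value below are illustrative assumptions, and the parameters only loosely mirror the new recall options this patch adds to src/common/options.cc (hunk truncated above); the real change uses Ceph's DecayCounter inside the MDS Server and SessionMap code touched in the diffstat.

// Sketch: track recently recalled caps in exponentially decaying counters,
// one global and one per session, and stop issuing further recall requests
// once either counter exceeds its threshold.  Names and numbers are assumed.
#include <algorithm>
#include <chrono>
#include <cmath>
#include <cstdint>
#include <iostream>

class SimpleDecayCounter {
  using Clock = std::chrono::steady_clock;
public:
  explicit SimpleDecayCounter(double half_life_secs)
    : half_life(half_life_secs), last(Clock::now()) {}

  void hit(double v) { decay(); value += v; }   // record recalled caps
  double get()       { decay(); return value; } // decayed recent total

private:
  void decay() {
    const auto now = Clock::now();
    const double dt = std::chrono::duration<double>(now - last).count();
    value *= std::exp2(-dt / half_life);        // halve every half_life secs
    last = now;
  }

  double half_life;
  double value = 0.0;
  Clock::time_point last;
};

// How many caps to ask one client to release on this tick, honoring the
// per-client floor and both throttles.  Returns 0 when we should back off.
uint64_t caps_to_recall(uint64_t session_caps,
                        uint64_t min_caps_per_client,
                        uint64_t max_caps_per_request,
                        double per_session_threshold,
                        double global_threshold,
                        SimpleDecayCounter& session_throttle,
                        SimpleDecayCounter& global_throttle) {
  if (session_caps <= min_caps_per_client)
    return 0;                                   // never recall below the floor
  if (session_throttle.get() >= per_session_threshold ||
      global_throttle.get() >= global_threshold)
    return 0;                                   // recalled too much recently
  const uint64_t want = std::min<uint64_t>(max_caps_per_request,
                                           session_caps - min_caps_per_client);
  session_throttle.hit(static_cast<double>(want));  // charge both throttles so
  global_throttle.hit(static_cast<double>(want));   // later ticks back off
  return want;
}

int main() {
  SimpleDecayCounter global(60.0), session(60.0);   // 60s half-life (assumed)
  // Simulate a few recall ticks against a client holding 100k caps.
  for (int tick = 0; tick < 5; ++tick) {
    uint64_t n = caps_to_recall(100000, 100, 5000, 16000.0, 64000.0,
                                session, global);
    std::cout << "tick " << tick << ": recall " << n << " caps\n";
  }
}

In this toy run the first few ticks each request a bounded batch of caps, and once the per-session counter crosses its threshold the function returns 0 until enough time has passed for the counter to decay, which is the incremental, self-limiting behavior the commit message aims for.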