From: Patrick Donnelly Date: Mon, 28 Jan 2019 23:48:38 +0000 (-0800) Subject: mds: simplify recall warnings X-Git-Tag: v12.2.12~63^2~2 X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=d0fadaac75af5e9c1b3b2362714d7934d25d6d0c;p=ceph.git mds: simplify recall warnings Instead of a timeout and complicated decisions about whether the client is releasing caps in an expeditious fashion, just use a DecayCounter that tracks the number of caps we've recalled. This counter is decremented whenever the client releases caps. If the counter passes a threshold, then we raise the warning. Similar reworking is done for the steady-state recall of client caps. Another release DecayCounter is added so we can tell when the client is not releasing any more caps. Signed-off-by: Patrick Donnelly (cherry picked from commit c0b3a1148475ea7e563b1b8c13217b6e7c85b150) Conflicts: PendingReleaseNotes src/common/options.cc src/mds/Beacon.cc src/mds/Server.cc src/mds/SessionMap.cc src/mds/SessionMap.h --- diff --git a/PendingReleaseNotes b/PendingReleaseNotes index 839961979acc..be06785b63e7 100644 --- a/PendingReleaseNotes +++ b/PendingReleaseNotes @@ -26,6 +26,12 @@ MDS when all of these caps need to be processed during certain session events. It is recommended to not unnecessarily increase this value. +* The MDS config mds_recall_state_timeout has been removed. Late client recall + warnings are now generated based on the number of caps the MDS has recalled + which have not been released. The new configs mds_recall_warning_threshold + (default: 32K) and mds_recall_warning_decay_rate (default: 60s) set the + threshold for this warning. 
+ >= 12.1.2 --------- * When running 'df' on a CephFS filesystem comprising exactly one data pool, diff --git a/qa/tasks/cephfs/test_client_limits.py b/qa/tasks/cephfs/test_client_limits.py index 833a37172569..bc029cd8a5ab 100644 --- a/qa/tasks/cephfs/test_client_limits.py +++ b/qa/tasks/cephfs/test_client_limits.py @@ -42,10 +42,13 @@ class TestClientLimits(CephFSTestCase): cache_size = open_files/2 self.set_conf('mds', 'mds cache size', cache_size) + self.set_conf('mds', 'mds_recall_max_caps', open_files/2) + self.set_conf('mds', 'mds_recall_warning_threshold', open_files) self.fs.mds_fail_restart() self.fs.wait_for_daemons() mds_min_caps_per_client = int(self.fs.get_config("mds_min_caps_per_client")) + mds_recall_warning_decay_rate = self.fs.get_config("mds_recall_warning_decay_rate") self.assertTrue(open_files >= mds_min_caps_per_client) mount_a_client_id = self.mount_a.get_global_id() @@ -63,13 +66,11 @@ class TestClientLimits(CephFSTestCase): # MDS should not be happy about that, as the client is failing to comply # with the SESSION_RECALL messages it is being sent - mds_recall_state_timeout = float(self.fs.get_config("mds_recall_state_timeout")) - self.wait_for_health("MDS_CLIENT_RECALL", mds_recall_state_timeout+10) + self.wait_for_health("MDS_CLIENT_RECALL", mds_recall_warning_decay_rate*2) # We can also test that the MDS health warning for oversized # cache is functioning as intended. 
- self.wait_for_health("MDS_CACHE_OVERSIZED", - mds_recall_state_timeout + 10) + self.wait_for_health("MDS_CACHE_OVERSIZED", mds_recall_warning_decay_rate*2) # When the client closes the files, it should retain only as many caps as allowed # under the SESSION_RECALL policy diff --git a/src/common/legacy_config_opts.h b/src/common/legacy_config_opts.h index 828697758124..2b4361103f32 100644 --- a/src/common/legacy_config_opts.h +++ b/src/common/legacy_config_opts.h @@ -443,7 +443,6 @@ OPTION(mds_session_blacklist_on_timeout, OPT_BOOL) // whether to blacklist cl OPTION(mds_session_blacklist_on_evict, OPT_BOOL) // whether to blacklist clients whose sessions are dropped via admin commands OPTION(mds_sessionmap_keys_per_op, OPT_U32) // how many sessions should I try to load/store in a single OMAP operation? -OPTION(mds_recall_state_timeout, OPT_FLOAT) // detect clients which aren't trimming caps OPTION(mds_freeze_tree_timeout, OPT_FLOAT) // detecting freeze tree deadlock OPTION(mds_health_summarize_threshold, OPT_INT) // collapse N-client health metrics to a single 'many' OPTION(mds_reconnect_timeout, OPT_FLOAT) // seconds to wait for clients during mds restart diff --git a/src/common/options.cc b/src/common/options.cc index 40efed5ac0ca..b54374b86206 100644 --- a/src/common/options.cc +++ b/src/common/options.cc @@ -6166,9 +6166,13 @@ std::vector