From: Patrick Donnelly
Date: Wed, 23 Jan 2019 14:41:55 +0000 (-0800)
Subject: mds: recall caps incrementally
X-Git-Tag: v12.2.12~63^2~7
X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=4b588f430ab088530e7a4fcab00d06223b1f340d;p=ceph.git

mds: recall caps incrementally

As with trimming, use DecayCounters to throttle the number of caps we
recall, both globally and per-session.

Signed-off-by: Patrick Donnelly
(cherry picked from commit ef46216d8d0b659549925481b4eff6bd7d2c43c9)

Conflicts:
	PendingReleaseNotes
	src/common/options.cc
	src/mds/Beacon.cc
	src/mds/Server.cc
	src/mds/Server.h
	src/mds/SessionMap.cc
	src/mds/SessionMap.h
---

diff --git a/PendingReleaseNotes b/PendingReleaseNotes
index 88add63e244..1218bef68e7 100644
--- a/PendingReleaseNotes
+++ b/PendingReleaseNotes
@@ -16,6 +16,10 @@
   via the `ceph tell mds. cache drop` command or large reductions in the
   cache size will no longer cause service unavailability.
 
+* The CephFS MDS behavior with recalling caps has been significantly improved
+  to not attempt recalling too many caps at once, leading to instability.
+  MDS with a large cache (64GB+) should be more stable.
+
 >= 12.1.2
 ---------
 * When running 'df' on a CephFS filesystem comprising exactly one data pool,
diff --git a/qa/suites/fs/bugs/client_trim_caps/tasks/trim-i22073.yaml b/qa/suites/fs/bugs/client_trim_caps/tasks/trim-i22073.yaml
index 410606225f0..f0ed3366c75 100644
--- a/qa/suites/fs/bugs/client_trim_caps/tasks/trim-i22073.yaml
+++ b/qa/suites/fs/bugs/client_trim_caps/tasks/trim-i22073.yaml
@@ -10,7 +10,6 @@ overrides:
 tasks:
 - exec:
     mon.a:
-    - "ceph tell mds.* config set mds_max_ratio_caps_per_client 1"
     - "ceph tell mds.* config set mds_min_caps_per_client 1"
 - background_exec:
     mon.a:
diff --git a/qa/tasks/cephfs/test_client_limits.py b/qa/tasks/cephfs/test_client_limits.py
index 1f1d5467079..322bd8c895e 100644
--- a/qa/tasks/cephfs/test_client_limits.py
+++ b/qa/tasks/cephfs/test_client_limits.py
@@ -47,7 +47,6 @@ class TestClientLimits(CephFSTestCase):
 
         mds_min_caps_per_client = int(self.fs.get_config("mds_min_caps_per_client"))
         self.assertTrue(open_files >= mds_min_caps_per_client)
-        mds_max_ratio_caps_per_client = float(self.fs.get_config("mds_max_ratio_caps_per_client"))
 
         mount_a_client_id = self.mount_a.get_global_id()
         path = "subdir/mount_a" if use_subdir else "mount_a"
@@ -84,14 +83,13 @@
 
         # The remaining caps should comply with the numbers sent from MDS in SESSION_RECALL message,
         # which depend on the caps outstanding, cache size and overall ratio
-        recall_expected_value = int((1.0-mds_max_ratio_caps_per_client)*(open_files+2))
         def expected_caps():
             num_caps = self.get_session(mount_a_client_id)['num_caps']
             if num_caps < mds_min_caps_per_client:
                 raise RuntimeError("client caps fell below min!")
             elif num_caps == mds_min_caps_per_client:
                 return True
-            elif recall_expected_value*.95 <= num_caps <= recall_expected_value*1.05:
+            elif num_caps < cache_size:
                 return True
             else:
                 return False
diff --git a/src/common/options.cc b/src/common/options.cc
index 57468a33208..3497b06a1fa 100644
--- a/src/common/options.cc
+++ b/src/common/options.cc
@@ -6150,6 +6150,22 @@ std::vector
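
For readers who want a feel for the mechanism, below is a minimal, self-contained C++ sketch
of the idea behind this change: every cap recall charges an exponentially decaying counter, and
recall is paused while the per-session or global counter sits above a threshold. This is not the
Ceph DecayCounter API or the MDS recall code; the class, function, and parameter names here
(DecayingCounter, caps_to_recall, recall_max_caps, the thresholds, the half-life) and all the
numbers are invented for illustration.

// Minimal sketch of a decaying-counter recall throttle (illustrative only;
// not the Ceph DecayCounter API). All names and values here are invented.
#include <algorithm>
#include <chrono>
#include <cmath>
#include <cstdint>
#include <iostream>

using Clock = std::chrono::steady_clock;

class DecayingCounter {
public:
  explicit DecayingCounter(double half_life_s) : half_life(half_life_s) {}

  // Record v events (e.g. caps recalled) at time 'now'.
  void hit(Clock::time_point now, double v) {
    decay(now);
    value += v;
  }

  // Exponentially decayed total as of 'now'.
  double get(Clock::time_point now) {
    decay(now);
    return value;
  }

private:
  void decay(Clock::time_point now) {
    std::chrono::duration<double> dt = now - last;
    if (dt.count() > 0) {
      value *= std::exp2(-dt.count() / half_life);  // halve every half_life seconds
      last = now;
    }
  }

  double half_life;
  double value = 0;
  Clock::time_point last = Clock::now();
};

// How many caps may be recalled from one session right now, honouring both a
// per-session and a global decaying budget. The knobs are hypothetical, not
// actual Ceph option names.
uint64_t caps_to_recall(DecayingCounter& session_ctr, DecayingCounter& global_ctr,
                        uint64_t session_caps, uint64_t min_caps_per_client,
                        uint64_t recall_max_caps,
                        double session_threshold, double global_threshold) {
  const auto now = Clock::now();
  if (session_ctr.get(now) >= session_threshold ||
      global_ctr.get(now) >= global_threshold)
    return 0;  // recent recalls have not decayed yet; back off
  if (session_caps <= min_caps_per_client)
    return 0;  // never push a client below its minimum
  uint64_t want = std::min<uint64_t>(recall_max_caps,
                                     session_caps - min_caps_per_client);
  session_ctr.hit(now, want);  // charge both budgets for what we ask back
  global_ctr.hit(now, want);
  return want;
}

int main() {
  DecayingCounter session_ctr(5.0), global_ctr(5.0);  // 5 s half-life
  uint64_t caps = 20000;
  for (int tick = 0; tick < 3; ++tick) {
    uint64_t n = caps_to_recall(session_ctr, global_ctr, caps,
                                /*min_caps_per_client=*/100,
                                /*recall_max_caps=*/5000,
                                /*session_threshold=*/8000.0,
                                /*global_threshold=*/32000.0);
    std::cout << "tick " << tick << ": recall " << n << " caps\n";
    caps -= n;
  }
}

Under these assumptions, a burst of recalls quickly drives the counter over its threshold and the
throttle backs off until the counter decays, bounding how many caps can be recalled in a short
window instead of sizing each recall by a fixed ratio of the client's caps (the removed
mds_max_ratio_caps_per_client behaviour).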