From c0b3a1148475ea7e563b1b8c13217b6e7c85b150 Mon Sep 17 00:00:00 2001
From: Patrick Donnelly <pdonnell@redhat.com>
Date: Mon, 28 Jan 2019 15:48:38 -0800
Subject: [PATCH] mds: simplify recall warnings

Instead of a timeout and complicated decisions about whether the client is
releasing caps in an expeditious fashion, just use a DecayCounter that tracks
the number of caps we've recalled. This counter is decremented whenever the
client releases caps. If the counter passes a threshold, then we raise the
warning.

Similar reworking is done for the steady-state recall of client caps. Another
release DecayCounter is added so we can tell when the client is not releasing
any more caps.

Signed-off-by: Patrick Donnelly <pdonnell@redhat.com>
---
 PendingReleaseNotes                   |  6 +++
 qa/tasks/cephfs/test_client_limits.py |  9 ++--
 src/common/legacy_config_opts.h       |  1 -
 src/common/options.cc                 | 10 ++--
 src/mds/Beacon.cc                     | 47 ++++++-------------
 src/mds/Server.cc                     | 66 ++++++++++++++++-----------
 src/mds/Server.h                      |  2 +-
 src/mds/SessionMap.cc                 | 45 ++++++------------
 src/mds/SessionMap.h                  | 35 ++++++++------
 9 files changed, 110 insertions(+), 111 deletions(-)

diff --git a/PendingReleaseNotes b/PendingReleaseNotes
index e04ce58793165..50bc4831ba825 100644
--- a/PendingReleaseNotes
+++ b/PendingReleaseNotes
@@ -179,6 +179,12 @@
   MDS when all of these caps need to be processed during certain session
   events. It is recommended to not unnecessarily increase this value.
 
+* The MDS config mds_recall_state_timeout has been removed. Late client recall
+  warnings are now generated based on the number of caps the MDS has recalled
+  which have not been released. The new configs mds_recall_warning_threshold
+  (default: 32K) and mds_recall_warning_decay_rate (default: 60s) sets the
+  threshold for this warning.
+
 >=13.1.0
 --------
 
diff --git a/qa/tasks/cephfs/test_client_limits.py b/qa/tasks/cephfs/test_client_limits.py
index 833a37172569f..bc029cd8a5ab8 100644
--- a/qa/tasks/cephfs/test_client_limits.py
+++ b/qa/tasks/cephfs/test_client_limits.py
@@ -42,10 +42,13 @@ class TestClientLimits(CephFSTestCase):
         cache_size = open_files/2
 
         self.set_conf('mds', 'mds cache size', cache_size)
+        self.set_conf('mds', 'mds_recall_max_caps', open_files/2)
+        self.set_conf('mds', 'mds_recall_warning_threshold', open_files)
         self.fs.mds_fail_restart()
         self.fs.wait_for_daemons()
 
         mds_min_caps_per_client = int(self.fs.get_config("mds_min_caps_per_client"))
+        mds_recall_warning_decay_rate = self.fs.get_config("mds_recall_warning_decay_rate")
         self.assertTrue(open_files >= mds_min_caps_per_client)
 
         mount_a_client_id = self.mount_a.get_global_id()
@@ -63,13 +66,11 @@ class TestClientLimits(CephFSTestCase):
 
         # MDS should not be happy about that, as the client is failing to comply
         # with the SESSION_RECALL messages it is being sent
-        mds_recall_state_timeout = float(self.fs.get_config("mds_recall_state_timeout"))
-        self.wait_for_health("MDS_CLIENT_RECALL", mds_recall_state_timeout+10)
+        self.wait_for_health("MDS_CLIENT_RECALL", mds_recall_warning_decay_rate*2)
 
         # We can also test that the MDS health warning for oversized
         # cache is functioning as intended.
-        self.wait_for_health("MDS_CACHE_OVERSIZED",
-                mds_recall_state_timeout + 10)
+        self.wait_for_health("MDS_CACHE_OVERSIZED", mds_recall_warning_decay_rate*2)
 
         # When the client closes the files, it should retain only as many caps as allowed
         # under the SESSION_RECALL policy
diff --git a/src/common/legacy_config_opts.h b/src/common/legacy_config_opts.h
index 21d7204980473..a71d2dac51a36 100644
--- a/src/common/legacy_config_opts.h
+++ b/src/common/legacy_config_opts.h
@@ -410,7 +410,6 @@ OPTION(mds_session_blacklist_on_timeout, OPT_BOOL)    // whether to blacklist cl
 OPTION(mds_session_blacklist_on_evict, OPT_BOOL)  // whether to blacklist clients whose sessions are dropped via admin commands
 
 OPTION(mds_sessionmap_keys_per_op, OPT_U32)    // how many sessions should I try to load/store in a single OMAP operation?
-OPTION(mds_recall_state_timeout, OPT_FLOAT)    // detect clients which aren't trimming caps
 OPTION(mds_freeze_tree_timeout, OPT_FLOAT)    // detecting freeze tree deadlock
 OPTION(mds_health_summarize_threshold, OPT_INT) // collapse N-client health metrics to a single 'many'
 OPTION(mds_reconnect_timeout, OPT_FLOAT)  // seconds to wait for clients during mds restart
diff --git a/src/common/options.cc b/src/common/options.cc
index 547b8b8757a33..245cac9af8f12 100644
--- a/src/common/options.cc
+++ b/src/common/options.cc
@@ -7247,9 +7247,13 @@ std::vector<Option> get_mds_options() {
     .set_default(64_K)
     .set_description("decay threshold for throttle on recalled caps globally"),
 
-    Option("mds_recall_state_timeout", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
-    .set_default(60)
-    .set_description("timeout for clients late on cap recall to create health warnings"),
+    Option("mds_recall_warning_threshold", Option::TYPE_SIZE, Option::LEVEL_ADVANCED)
+    .set_default(32_K)
+    .set_description("decay threshold for warning on slow session cap recall"),
+
+    Option("mds_recall_warning_decay_rate", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+    .set_default(60.0)
+    .set_description("decay rate for warning on slow session cap recall"),
 
     Option("mds_freeze_tree_timeout", Option::TYPE_FLOAT, Option::LEVEL_DEV)
     .set_default(30)
diff --git a/src/mds/Beacon.cc b/src/mds/Beacon.cc
index d2bb5cac6f214..07330520c50a5 100644
--- a/src/mds/Beacon.cc
+++ b/src/mds/Beacon.cc
@@ -373,45 +373,28 @@ void Beacon::notify_health(MDSRank const *mds)
   {
     set<Session*> sessions;
     mds->sessionmap.get_client_session_set(sessions);
-    auto now = clock::now();
-
-    const auto mds_recall_state_timeout = g_conf()->mds_recall_state_timeout;
-    const auto last_recall = mds->server->last_recalled();
-    const auto last_recall_span = std::chrono::duration<double>(now-last_recall).count();
-    const bool recall_state_timedout = last_recall_span > mds_recall_state_timeout;
 
+    const auto recall_warning_threshold = g_conf().get_val<Option::size_t>("mds_recall_warning_threshold");
+    const auto max_completed_requests = g_conf()->mds_max_completed_requests;
+    const auto max_completed_flushes = g_conf()->mds_max_completed_flushes;
     std::list<MDSHealthMetric> late_recall_metrics;
     std::list<MDSHealthMetric> large_completed_requests_metrics;
     for (auto& session : sessions) {
-      const auto& recall_release_count = session->recall_release_count;
-      const auto& recall_count = session->recall_count;
-      if (recall_release_count < recall_count) {
-        const auto& recalled_at = session->recalled_at;
-        const auto& released_at = session->released_at;
-        const auto recalled_at_span = std::chrono::duration<double>(now-recalled_at).count();
-        const auto released_at_span = std::chrono::duration<double>(now-released_at).count();
-
-        dout(20) << "Session " << session->info.inst
-          << " last released " << recall_release_count << "/" << recall_count << " caps "
-          << released_at_span << "s ago " << dendl;
-	if (recall_state_timedout) {
-	  dout(20) << "  no longer recall" << dendl;
-          session->clear_recalled();
-	} else if (released_at_span > mds_recall_state_timeout && recalled_at_span > mds_recall_state_timeout) {
-          dout(20) << "  exceeded timeout " << released_at_span << " vs. " << mds_recall_state_timeout << dendl;
-          std::ostringstream oss;
-	  oss << "Client " << session->get_human_name() << " failing to respond to cache pressure";
-          MDSHealthMetric m(MDS_HEALTH_CLIENT_RECALL, HEALTH_WARN, oss.str());
-          m.metadata["client_id"] = stringify(session->get_client());
-          late_recall_metrics.push_back(m);
-        } else {
-          dout(20) << "  within timeout " << released_at_span << " vs. " << mds_recall_state_timeout << dendl;
-        }
+      const uint64_t recall_caps = session->get_recall_caps();
+      if (recall_caps > recall_warning_threshold) {
+        dout(2) << "Session " << *session <<
+             " is not releasing caps fast enough. Recalled caps at " << recall_caps
+          << " > " << recall_warning_threshold << " (mds_recall_warning_threshold)." << dendl;
+        std::ostringstream oss;
+        oss << "Client " << session->get_human_name() << " failing to respond to cache pressure";
+        MDSHealthMetric m(MDS_HEALTH_CLIENT_RECALL, HEALTH_WARN, oss.str());
+        m.metadata["client_id"] = stringify(session->get_client());
+        late_recall_metrics.push_back(m);
       }
       if ((session->get_num_trim_requests_warnings() > 0 &&
-	   session->get_num_completed_requests() >= g_conf()->mds_max_completed_requests) ||
+	   session->get_num_completed_requests() >= max_completed_requests) ||
 	  (session->get_num_trim_flushes_warnings() > 0 &&
-	   session->get_num_completed_flushes() >= g_conf()->mds_max_completed_flushes)) {
+	   session->get_num_completed_flushes() >= max_completed_flushes)) {
 	std::ostringstream oss;
 	oss << "Client " << session->get_human_name() << " failing to advance its oldest client/flush tid";
 	MDSHealthMetric m(MDS_HEALTH_CLIENT_OLDEST_TID, HEALTH_WARN, oss.str());
diff --git a/src/mds/Server.cc b/src/mds/Server.cc
index ead97a5f4bd79..455a6d0ef4872 100644
--- a/src/mds/Server.cc
+++ b/src/mds/Server.cc
@@ -191,7 +191,7 @@ Server::Server(MDSRank *m) :
   failed_reconnects(0),
   reconnect_evicting(false),
   terminating_sessions(false),
-  recall_counter(g_conf().get_val<double>("mds_recall_max_decay_rate"))
+  recall_throttle(g_conf().get_val<double>("mds_recall_max_decay_rate"))
 {
   supported_features = feature_bitset_t(CEPHFS_FEATURES_MDS_SUPPORTED);
 }
@@ -1076,7 +1076,7 @@ void Server::handle_conf_change(const ConfigProxy& conf,
             << cap_revoke_eviction_timeout << dendl;
   }
   if (changed.count("mds_recall_max_decay_rate")) {
-    recall_counter = DecayCounter(g_conf().get_val<double>("mds_recall_max_decay_rate"));
+    recall_throttle = DecayCounter(g_conf().get_val<double>("mds_recall_max_decay_rate"));
   }
 }
 
@@ -1533,7 +1533,7 @@ std::pair<bool, uint64_t> Server::recall_client_state(MDSGatherBuilder* gather,
   const auto recall_max_caps = g_conf().get_val<Option::size_t>("mds_recall_max_caps");
   const auto recall_max_decay_threshold = g_conf().get_val<Option::size_t>("mds_recall_max_decay_threshold");
 
-  dout(10) << __func__ << ":"
+  dout(7) << __func__ << ":"
            << " min=" << min_caps_per_client
            << " max=" << max_caps_per_client
            << " total=" << Capability::count()
@@ -1559,7 +1559,8 @@ std::pair<bool, uint64_t> Server::recall_client_state(MDSGatherBuilder* gather,
 	!session->info.inst.name.is_client())
       continue;
 
-    dout(10) << " session " << session->info.inst
+    dout(10) << __func__ << ":"
+             << " session " << session->info.inst
 	     << " caps " << num_caps
 	     << ", leases " << session->leases.size()
 	     << dendl;
@@ -1574,36 +1575,49 @@ std::pair<bool, uint64_t> Server::recall_client_state(MDSGatherBuilder* gather,
       /* now limit the number of caps we recall at a time to prevent overloading ourselves */
       uint64_t recall = std::min<uint64_t>(recall_max_caps, num_caps-newlim);
       newlim = num_caps-recall;
-      const uint64_t session_recall_counter = session->cap_recalled_counter();
-      const uint64_t global_recall_counter = recall_counter.get();
-      if (session_recall_counter+recall > recall_max_decay_threshold) {
-        dout(15) << "  session recall threshold (" << recall_max_decay_threshold << ") hit at " << session_recall_counter << "; skipping!" << dendl;
+      const uint64_t session_recall_throttle = session->get_recall_caps_throttle();
+      const uint64_t global_recall_throttle = recall_throttle.get();
+      if (session_recall_throttle+recall > recall_max_decay_threshold) {
+        dout(15) << "  session recall threshold (" << recall_max_decay_threshold << ") hit at " << session_recall_throttle << "; skipping!" << dendl;
         throttled = true;
         continue;
-      } else if (global_recall_counter+recall > recall_global_max_decay_threshold) {
-        dout(15) << "  global recall threshold (" << recall_global_max_decay_threshold << ") hit at " << global_recall_counter << "; skipping!" << dendl;
+      } else if (global_recall_throttle+recall > recall_global_max_decay_threshold) {
+        dout(15) << "  global recall threshold (" << recall_global_max_decay_threshold << ") hit at " << global_recall_throttle << "; skipping!" << dendl;
         throttled = true;
         break;
       }
 
       // now check if we've recalled caps recently and the client is unlikely to satisfy a new recall
-      const auto& recalled_at = session->recalled_at;
-      auto last_recalled = std::chrono::duration<double>(now-recalled_at).count();
-      const auto& released_at = session->released_at;
-      const auto& last_recall_count = session->recall_count;
-      const auto& last_recall_release_count = session->recall_release_count;
-      const auto& last_recall_limit = session->recall_limit;
-      bool limit_similarity = (abs((double)num_caps-last_recall_limit+recall_max_caps)/(num_caps+recall_max_caps)) < 0.05;
-      if (last_recalled < 3600.0 && released_at < recalled_at && last_recall_count > 2*last_recall_release_count && limit_similarity && steady) {
-        /* The session has recently (1hr) been asked to release caps and we
-         * were unable to get at least half of the recalled caps.
-         */
-        dout(15) << "  last recalled " << last_recall_count << "/" << (last_recall_count+last_recall_limit)
-                 << " caps " << last_recalled << "s ago; released "
-                 << last_recall_release_count << " caps. Skipping because we are unlikely to get more released." << dendl;
-        continue;
+      if (steady) {
+        const auto session_recall = session->get_recall_caps();
+        const auto session_release = session->get_release_caps();
+        if (2*session_release < session_recall && 2*session_recall > recall_max_decay_threshold) {
+          /* The session has been unable to keep up with the number of caps
+           * recalled (by half); additionally, to prevent marking sessions
+           * we've just begun to recall from, the session_recall counter
+           * (decayed count of caps recently recalled) is **greater** than the
+           * session threshold for the session's cap recall throttle.
+           */
+          dout(15) << "  2*session_release < session_recall"
+                      " (2*" << session_release << " < " << session_recall << ");"
+                      " Skipping because we are unlikely to get more released." << dendl;
+          continue;
+        } else if (recall < recall_max_caps && 2*recall < session_recall) {
+          /* The number of caps recalled is less than the number we *could*
+           * recall (so there isn't much left to recall?) and the number of
+           * caps is less than the current recall_caps counter (decayed count
+           * of caps recently recalled).
+           */
+          dout(15) << "  2*recall < session_recall "
+                      " (2*" << recall << " < " << session_recall << ") &&"
+                      " recall < recall_max_caps (" << recall << " < " << recall_max_caps << ");"
+                      " Skipping because we are unlikely to get more released." << dendl;
+          continue;
+        }
       }
 
+      dout(7) << "  recalling " << recall << " caps; session_recall_throttle = " << session_recall_throttle << "; global_recall_throttle = " << global_recall_throttle << dendl;
+
       auto m = MClientSession::create(CEPH_SESSION_RECALL_STATE);
       m->head.max_caps = newlim;
       mds->send_message_client(m, session);
@@ -1611,7 +1625,7 @@ std::pair<bool, uint64_t> Server::recall_client_state(MDSGatherBuilder* gather,
         flush_session(session, gather);
       }
       caps_recalled += session->notify_recall_sent(newlim);
-      recall_counter.hit(recall);
+      recall_throttle.hit(recall);
     }
   }
 
diff --git a/src/mds/Server.h b/src/mds/Server.h
index fc504e3249566..6bb84b0615139 100644
--- a/src/mds/Server.h
+++ b/src/mds/Server.h
@@ -350,7 +350,7 @@ private:
   void reply_client_request(MDRequestRef& mdr, const MClientReply::ref &reply);
   void flush_session(Session *session, MDSGatherBuilder *gather);
 
-  DecayCounter recall_counter;
+  DecayCounter recall_throttle;
   time last_recall_state;
 };
 
diff --git a/src/mds/SessionMap.cc b/src/mds/SessionMap.cc
index bf30a0fed9182..340ad0fa54143 100644
--- a/src/mds/SessionMap.cc
+++ b/src/mds/SessionMap.cc
@@ -855,13 +855,8 @@ size_t Session::get_request_count()
  */
 void Session::notify_cap_release(size_t n_caps)
 {
-  recall_release_count += n_caps;
-  if (n_caps > 0) {
-    released_at = clock::now();
-    if (recall_count <= recall_release_count) {
-      clear_recalled();
-    }
-  }
+  recall_caps.hit(-(double)n_caps);
+  release_caps.hit(n_caps);
 }
 
 /**
@@ -870,25 +865,13 @@ void Session::notify_cap_release(size_t n_caps)
  * in order to generate health metrics if the session doesn't see
  * a commensurate number of calls to ::notify_cap_release
  */
-uint64_t Session::notify_recall_sent(const size_t new_limit)
+uint64_t Session::notify_recall_sent(size_t new_limit)
 {
   const auto num_caps = caps.size();
-  ceph_assert(new_limit < num_caps);
+  ceph_assert(new_limit < num_caps);  // Behaviour of Server::recall_client_state
   const auto count = num_caps-new_limit;
-
-  /* Entering recall phase, set up counters so we can later judge whether the
-   * client has respected the recall request. Update only if client has not
-   * released caps from a previous recall.
-   */
-
   uint64_t new_change;
   if (recall_limit != new_limit) {
-    const auto now = clock::now();
-    recalled_at = now;
-    assert (new_limit < num_caps);  // Behaviour of Server::recall_client_state
-    recall_count = count;
-    recall_release_count = 0;
-    recall_limit = new_limit;
     new_change = count;
   } else {
     new_change = 0; /* no change! */
@@ -899,17 +882,11 @@ uint64_t Session::notify_recall_sent(const size_t new_limit)
    * session that is not releasing caps (i.e. allow the session counter to
    * throttle future RECALL messages).
    */
-  cap_recalled.hit(count);
+  recall_caps_throttle.hit(count);
+  recall_caps.hit(count);
   return new_change;
 }
 
-void Session::clear_recalled()
-{
-  recall_count = 0;
-  recall_release_count = 0;
-  recall_limit = 0;
-}
-
 /**
  * Use client metadata to generate a somewhat-friendlier
  * name for the client than its session ID.
@@ -1031,7 +1008,15 @@ void SessionMap::handle_conf_change(const ConfigProxy &conf,
   if (changed.count("mds_recall_max_decay_rate")) {
     auto d = g_conf().get_val<double>("mds_recall_max_decay_rate");
     auto mut = [d](auto s) {
-      s->cap_recalled = DecayCounter(d);
+      s->recall_caps_throttle = DecayCounter(d);
+    };
+    apply_to_open_sessions(mut);
+  }
+  if (changed.count("mds_recall_warning_decay_rate")) {
+    auto d = g_conf().get_val<double>("mds_recall_warning_decay_rate");
+    auto mut = [d](auto s) {
+      s->recall_caps = DecayCounter(d);
+      s->release_caps = DecayCounter(d);
     };
     apply_to_open_sessions(mut);
   }
diff --git a/src/mds/SessionMap.h b/src/mds/SessionMap.h
index e3530f9566118..b7f747684136c 100644
--- a/src/mds/SessionMap.h
+++ b/src/mds/SessionMap.h
@@ -113,8 +113,15 @@ private:
   // request load average for this session
   DecayCounter load_avg;
 
-  // caps being recalled recently by this session
-  DecayCounter cap_recalled;
+  // Ephemeral state for tracking progress of capability recalls
+  // caps being recalled recently by this session; used for Beacon warnings
+  DecayCounter recall_caps;
+  // caps that have been released
+  DecayCounter release_caps;
+  // throttle on caps recalled
+  DecayCounter recall_caps_throttle;
+  // New limit in SESSION_RECALL
+  uint32_t recall_limit = 0;
 
   // session start time -- used to track average session time
   // note that this is initialized in the constructor rather
@@ -156,13 +163,6 @@ public:
 
   const std::string& get_human_name() const {return human_name;}
 
-  // Ephemeral state for tracking progress of capability recalls
-  time recalled_at = clock::zero();  // When was I first asked to SESSION_RECALL?
-  time released_at = clock::zero();  // When did the session last release caps?
-  uint32_t recall_count = 0;  // How many caps was I asked to SESSION_RECALL?
-  uint32_t recall_release_count = 0;  // How many caps have I actually revoked?
-  uint32_t recall_limit = 0;  // New limit in SESSION_RECALL
-
   session_info_t info;                         ///< durable bits
 
   MDSAuthCaps auth_caps;
@@ -181,11 +181,16 @@ public:
   interval_set<inodeno_t> pending_prealloc_inos; // journaling prealloc, will be added to prealloc_inos
 
   void notify_cap_release(size_t n_caps);
-  uint64_t notify_recall_sent(const size_t new_limit);
-  auto cap_recalled_counter() const {
-    return cap_recalled.get();
+  uint64_t notify_recall_sent(size_t new_limit);
+  auto get_recall_caps_throttle() const {
+    return recall_caps_throttle.get();
+  }
+  auto get_recall_caps() const {
+    return recall_caps.get();
+  }
+  auto get_release_caps() const {
+    return release_caps.get();
   }
-  void clear_recalled();
 
   inodeno_t next_ino() const {
     if (info.prealloc_inos.empty())
@@ -384,7 +389,9 @@ public:
 
   Session() = delete;
   Session(ConnectionRef con) :
-    cap_recalled(g_conf().get_val<double>("mds_recall_max_decay_rate")),
+    recall_caps(g_conf().get_val<double>("mds_recall_warning_decay_rate")),
+    release_caps(g_conf().get_val<double>("mds_recall_warning_decay_rate")),
+    recall_caps_throttle(g_conf().get_val<double>("mds_recall_max_decay_rate")),
     birth_time(clock::now()),
     auth_caps(g_ceph_context),
     item_session_list(this),
-- 
2.39.5