From: Patrick Donnelly <pdonnell@redhat.com>
Date: Mon, 28 Jan 2019 23:48:38 +0000 (-0800)
Subject: mds: simplify recall warnings
X-Git-Tag: v13.2.7~24^2~2
X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=063b795d3b80d39d97ad37a1ad97e30bf7b6f22c;p=ceph.git

mds: simplify recall warnings

Instead of a timeout and complicated decisions about whether the client is
releasing caps in an expeditious fashion, just use a DecayCounter that tracks
the number of caps we've recalled. This counter is decremented whenever the
client releases caps. If the counter passes a threshold, then we raise the
warning.

Similar reworking is done for the steady-state recall of client caps. Another
release DecayCounter is added so we can tell when the client is not releasing
any more caps.

Signed-off-by: Patrick Donnelly <pdonnell@redhat.com>
(cherry picked from commit c0b3a11)

Conflicts:
	PendingReleaseNotes
	src/mds/Beacon.cc
	src/mds/Server.cc
	src/mds/SessionMap.cc
	src/mds/SessionMap.h
---

diff --git a/PendingReleaseNotes b/PendingReleaseNotes
index 142ac77d659fb..1a8bb18f6a5c9 100644
--- a/PendingReleaseNotes
+++ b/PendingReleaseNotes
@@ -15,6 +15,12 @@
   MDS when all of these caps need to be processed during certain session
   events. It is recommended to not unnecessarily increase this value.
 
+* The MDS config mds_recall_state_timeout has been removed. Late client recall
+  warnings are now generated based on the number of caps the MDS has recalled
+  which have not been released. The new configs mds_recall_warning_threshold
+  (default: 32K) and mds_recall_warning_decay_rate (default: 60s) sets the
+  threshold for this warning.
+
 * The `cache drop` admin socket command has been removed. The `ceph tell mds.X
   cache drop` remains.
 
diff --git a/qa/tasks/cephfs/test_client_limits.py b/qa/tasks/cephfs/test_client_limits.py
index 1dfa752be7afc..faae19efea5bc 100644
--- a/qa/tasks/cephfs/test_client_limits.py
+++ b/qa/tasks/cephfs/test_client_limits.py
@@ -42,10 +42,13 @@ class TestClientLimits(CephFSTestCase):
         cache_size = open_files/2
 
         self.set_conf('mds', 'mds cache size', cache_size)
+        self.set_conf('mds', 'mds_recall_max_caps', open_files/2)
+        self.set_conf('mds', 'mds_recall_warning_threshold', open_files)
         self.fs.mds_fail_restart()
         self.fs.wait_for_daemons()
 
         mds_min_caps_per_client = int(self.fs.get_config("mds_min_caps_per_client"))
+        mds_recall_warning_decay_rate = self.fs.get_config("mds_recall_warning_decay_rate")
         self.assertTrue(open_files >= mds_min_caps_per_client)
 
         mount_a_client_id = self.mount_a.get_global_id()
@@ -63,13 +66,11 @@ class TestClientLimits(CephFSTestCase):
 
         # MDS should not be happy about that, as the client is failing to comply
         # with the SESSION_RECALL messages it is being sent
-        mds_recall_state_timeout = float(self.fs.get_config("mds_recall_state_timeout"))
-        self.wait_for_health("MDS_CLIENT_RECALL", mds_recall_state_timeout+10)
+        self.wait_for_health("MDS_CLIENT_RECALL", mds_recall_warning_decay_rate*2)
 
         # We can also test that the MDS health warning for oversized
         # cache is functioning as intended.
-        self.wait_for_health("MDS_CACHE_OVERSIZED",
-                mds_recall_state_timeout + 10)
+        self.wait_for_health("MDS_CACHE_OVERSIZED", mds_recall_warning_decay_rate*2)
 
         # When the client closes the files, it should retain only as many caps as allowed
         # under the SESSION_RECALL policy
diff --git a/src/common/legacy_config_opts.h b/src/common/legacy_config_opts.h
index 70be6ef6132f3..5645c9eb23268 100644
--- a/src/common/legacy_config_opts.h
+++ b/src/common/legacy_config_opts.h
@@ -415,7 +415,6 @@ OPTION(mds_session_blacklist_on_timeout, OPT_BOOL)    // whether to blacklist cl
 OPTION(mds_session_blacklist_on_evict, OPT_BOOL)  // whether to blacklist clients whose sessions are dropped via admin commands
 
 OPTION(mds_sessionmap_keys_per_op, OPT_U32)    // how many sessions should I try to load/store in a single OMAP operation?
-OPTION(mds_recall_state_timeout, OPT_FLOAT)    // detect clients which aren't trimming caps
 OPTION(mds_freeze_tree_timeout, OPT_FLOAT)    // detecting freeze tree deadlock
 OPTION(mds_health_summarize_threshold, OPT_INT) // collapse N-client health metrics to a single 'many'
 OPTION(mds_reconnect_timeout, OPT_FLOAT)  // seconds to wait for clients during mds restart
diff --git a/src/common/options.cc b/src/common/options.cc
index 5a8ec3484aeb2..a813c2150e53d 100644
--- a/src/common/options.cc
+++ b/src/common/options.cc
@@ -6797,9 +6797,13 @@ std::vector<Option> get_mds_options() {
     .set_default(64_K)
     .set_description("decay threshold for throttle on recalled caps globally"),
 
-    Option("mds_recall_state_timeout", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
-    .set_default(60)
-    .set_description("timeout for clients late on cap recall to create health warnings"),
+    Option("mds_recall_warning_threshold", Option::TYPE_SIZE, Option::LEVEL_ADVANCED)
+    .set_default(32_K)
+    .set_description("decay threshold for warning on slow session cap recall"),
+
+    Option("mds_recall_warning_decay_rate", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+    .set_default(60.0)
+    .set_description("decay rate for warning on slow session cap recall"),
 
     Option("mds_freeze_tree_timeout", Option::TYPE_FLOAT, Option::LEVEL_DEV)
     .set_default(30)
diff --git a/src/mds/Beacon.cc b/src/mds/Beacon.cc
index 280b67cb9aa9a..48c5ed2b53625 100644
--- a/src/mds/Beacon.cc
+++ b/src/mds/Beacon.cc
@@ -390,45 +390,28 @@ void Beacon::notify_health(MDSRank const *mds)
   {
     set<Session*> sessions;
     mds->sessionmap.get_client_session_set(sessions);
-    auto now = clock::now();
-
-    const auto mds_recall_state_timeout = g_conf->mds_recall_state_timeout;
-    const auto last_recall = mds->server->last_recalled();
-    const auto last_recall_span = std::chrono::duration<double>(clock::now()-last_recall).count();
-    const bool recall_state_timedout = last_recall_span > mds_recall_state_timeout;
 
+    const auto recall_warning_threshold = g_conf->get_val<Option::size_t>("mds_recall_warning_threshold");
+    const auto max_completed_requests = g_conf->mds_max_completed_requests;
+    const auto max_completed_flushes = g_conf->mds_max_completed_flushes;
     std::list<MDSHealthMetric> late_recall_metrics;
     std::list<MDSHealthMetric> large_completed_requests_metrics;
     for (auto& session : sessions) {
-      const auto& recall_release_count = session->recall_release_count;
-      const auto& recall_count = session->recall_count;
-      if (recall_release_count < recall_count) {
-        const auto& recalled_at = session->recalled_at;
-        const auto& released_at = session->released_at;
-        const auto recalled_at_span = std::chrono::duration<double>(now-recalled_at).count();
-        const auto released_at_span = std::chrono::duration<double>(now-released_at).count();
-
-        dout(20) << "Session " << session->info.inst
-          << " last released " << recall_release_count << "/" << recall_count << " caps "
-          << released_at_span << "s ago " << dendl;
-	if (recall_state_timedout) {
-	  dout(20) << "  no longer recall" << dendl;
-          session->clear_recalled();
-	} else if (released_at_span > mds_recall_state_timeout && recalled_at_span > mds_recall_state_timeout) {
-          dout(20) << "  exceeded timeout " << released_at_span << " vs. " << mds_recall_state_timeout << dendl;
-          std::ostringstream oss;
-	  oss << "Client " << session->get_human_name() << " failing to respond to cache pressure";
-          MDSHealthMetric m(MDS_HEALTH_CLIENT_RECALL, HEALTH_WARN, oss.str());
-          m.metadata["client_id"] = stringify(session->get_client());
-          late_recall_metrics.push_back(m);
-        } else {
-          dout(20) << "  within timeout " << released_at_span << " vs. " << mds_recall_state_timeout << dendl;
-        }
+      const uint64_t recall_caps = session->get_recall_caps();
+      if (recall_caps > recall_warning_threshold) {
+        dout(2) << "Session " << *session <<
+             " is not releasing caps fast enough. Recalled caps at " << recall_caps
+          << " > " << recall_warning_threshold << " (mds_recall_warning_threshold)." << dendl;
+        std::ostringstream oss;
+        oss << "Client " << session->get_human_name() << " failing to respond to cache pressure";
+        MDSHealthMetric m(MDS_HEALTH_CLIENT_RECALL, HEALTH_WARN, oss.str());
+        m.metadata["client_id"] = stringify(session->get_client());
+        late_recall_metrics.push_back(m);
       }
       if ((session->get_num_trim_requests_warnings() > 0 &&
-	   session->get_num_completed_requests() >= g_conf->mds_max_completed_requests) ||
+	   session->get_num_completed_requests() >= max_completed_requests) ||
 	  (session->get_num_trim_flushes_warnings() > 0 &&
-	   session->get_num_completed_flushes() >= g_conf->mds_max_completed_flushes)) {
+	   session->get_num_completed_flushes() >= max_completed_flushes)) {
 	std::ostringstream oss;
 	oss << "Client " << session->get_human_name() << " failing to advance its oldest client/flush tid. ";
 	MDSHealthMetric m(MDS_HEALTH_CLIENT_OLDEST_TID, HEALTH_WARN, oss.str());
diff --git a/src/mds/Server.cc b/src/mds/Server.cc
index fbfdb6a5bc198..1ffc6a40883c0 100644
--- a/src/mds/Server.cc
+++ b/src/mds/Server.cc
@@ -201,7 +201,7 @@ Server::Server(MDSRank *m) :
   failed_reconnects(0),
   reconnect_evicting(false),
   terminating_sessions(false),
-  recall_counter(g_conf->get_val<double>("mds_recall_max_decay_rate"))
+  recall_throttle(ceph_clock_now(), g_conf->get_val<double>("mds_recall_max_decay_rate"))
 {
   cap_revoke_eviction_timeout = g_conf->get_val<double>("mds_cap_revoke_eviction_timeout");
   supported_features = feature_bitset_t(CEPHFS_FEATURES_MDS_SUPPORTED);
@@ -922,7 +922,7 @@ void Server::handle_conf_change(const struct md_config_t *conf,
             << cap_revoke_eviction_timeout << dendl;
   }
   if (changed.count("mds_recall_max_decay_rate")) {
-    recall_counter = DecayCounter(ceph_clock_now(), g_conf->get_val<double>("mds_recall_max_decay_rate"));
+    recall_throttle = DecayCounter(ceph_clock_now(), g_conf->get_val<double>("mds_recall_max_decay_rate"));
   }
 }
 
@@ -1374,7 +1374,7 @@ std::pair<bool, uint64_t> Server::recall_client_state(MDSGatherBuilder* gather,
   const auto recall_max_caps = g_conf->get_val<Option::size_t>("mds_recall_max_caps");
   const auto recall_max_decay_threshold = g_conf->get_val<Option::size_t>("mds_recall_max_decay_threshold");
 
-  dout(10) << __func__ << ":"
+  dout(7) << __func__ << ":"
            << " min=" << min_caps_per_client
            << " max=" << max_caps_per_client
            << " total=" << Capability::count()
@@ -1403,7 +1403,8 @@ std::pair<bool, uint64_t> Server::recall_client_state(MDSGatherBuilder* gather,
 	!session->info.inst.name.is_client())
       continue;
 
-    dout(10) << " session " << session->info.inst
+    dout(10) << __func__ << ":"
+             << " session " << session->info.inst
 	     << " caps " << num_caps
 	     << ", leases " << session->leases.size()
 	     << dendl;
@@ -1418,36 +1419,49 @@ std::pair<bool, uint64_t> Server::recall_client_state(MDSGatherBuilder* gather,
       /* now limit the number of caps we recall at a time to prevent overloading ourselves */
       uint64_t recall = std::min<uint64_t>(recall_max_caps, num_caps-newlim);
       newlim = num_caps-recall;
-      const uint64_t session_recall_counter = session->cap_recalled_counter();
-      const uint64_t global_recall_counter = recall_counter.get();
-      if (session_recall_counter+recall > recall_max_decay_threshold) {
-        dout(15) << "  session recall threshold (" << recall_max_decay_threshold << ") hit at " << session_recall_counter << "; skipping!" << dendl;
+      const uint64_t session_recall_throttle = session->get_recall_caps_throttle();
+      const uint64_t global_recall_throttle = recall_throttle.get(ceph_clock_now());
+      if (session_recall_throttle+recall > recall_max_decay_threshold) {
+        dout(15) << "  session recall threshold (" << recall_max_decay_threshold << ") hit at " << session_recall_throttle << "; skipping!" << dendl;
         throttled = true;
         continue;
-      } else if (global_recall_counter+recall > recall_global_max_decay_threshold) {
-        dout(15) << "  global recall threshold (" << recall_global_max_decay_threshold << ") hit at " << global_recall_counter << "; skipping!" << dendl;
+      } else if (global_recall_throttle+recall > recall_global_max_decay_threshold) {
+        dout(15) << "  global recall threshold (" << recall_global_max_decay_threshold << ") hit at " << global_recall_throttle << "; skipping!" << dendl;
         throttled = true;
         break;
       }
 
       // now check if we've recalled caps recently and the client is unlikely to satisfy a new recall
-      const auto& recalled_at = session->recalled_at;
-      auto last_recalled = std::chrono::duration<double>(now-recalled_at).count();
-      const auto& released_at = session->released_at;
-      const auto& last_recall_count = session->recall_count;
-      const auto& last_recall_release_count = session->recall_release_count;
-      const auto& last_recall_limit = session->recall_limit;
-      bool limit_similarity = (abs((double)num_caps-last_recall_limit+recall_max_caps)/(num_caps+recall_max_caps)) < 0.05;
-      if (last_recalled < 3600.0 && released_at < recalled_at && last_recall_count > 2*last_recall_release_count && limit_similarity && steady) {
-        /* The session has recently (1hr) been asked to release caps and we
-         * were unable to get at least half of the recalled caps.
-         */
-        dout(15) << "  last recalled " << last_recall_count << "/" << (last_recall_count+last_recall_limit)
-                 << " caps " << last_recalled << "s ago; released "
-                 << last_recall_release_count << " caps. Skipping because we are unlikely to get more released." << dendl;
-        continue;
+      if (steady) {
+        const auto session_recall = session->get_recall_caps();
+        const auto session_release = session->get_release_caps();
+        if (2*session_release < session_recall && 2*session_recall > recall_max_decay_threshold) {
+          /* The session has been unable to keep up with the number of caps
+           * recalled (by half); additionally, to prevent marking sessions
+           * we've just begun to recall from, the session_recall counter
+           * (decayed count of caps recently recalled) is **greater** than the
+           * session threshold for the session's cap recall throttle.
+           */
+          dout(15) << "  2*session_release < session_recall"
+                      " (2*" << session_release << " < " << session_recall << ");"
+                      " Skipping because we are unlikely to get more released." << dendl;
+          continue;
+        } else if (recall < recall_max_caps && 2*recall < session_recall) {
+          /* The number of caps recalled is less than the number we *could*
+           * recall (so there isn't much left to recall?) and the number of
+           * caps is less than the current recall_caps counter (decayed count
+           * of caps recently recalled).
+           */
+          dout(15) << "  2*recall < session_recall "
+                      " (2*" << recall << " < " << session_recall << ") &&"
+                      " recall < recall_max_caps (" << recall << " < " << recall_max_caps << ");"
+                      " Skipping because we are unlikely to get more released." << dendl;
+          continue;
+        }
       }
 
+      dout(7) << "  recalling " << recall << " caps; session_recall_throttle = " << session_recall_throttle << "; global_recall_throttle = " << global_recall_throttle << dendl;
+
       auto m = new MClientSession(CEPH_SESSION_RECALL_STATE);
       m->head.max_caps = newlim;
       mds->send_message_client(m, session);
@@ -1455,7 +1469,7 @@ std::pair<bool, uint64_t> Server::recall_client_state(MDSGatherBuilder* gather,
         flush_session(session, gather);
       }
       caps_recalled += session->notify_recall_sent(newlim);
-      recall_counter.hit(ceph_clock_now(), recall);
+      recall_throttle.hit(ceph_clock_now(), recall);
     }
   }
 
diff --git a/src/mds/Server.h b/src/mds/Server.h
index 85761b55be88b..66c98e4cf8b28 100644
--- a/src/mds/Server.h
+++ b/src/mds/Server.h
@@ -344,7 +344,7 @@ private:
   void reply_client_request(MDRequestRef& mdr, MClientReply *reply);
   void flush_session(Session *session, MDSGatherBuilder *gather);
 
-  DecayCounter recall_counter;
+  DecayCounter recall_throttle;
   time last_recall_state;
 };
 
diff --git a/src/mds/SessionMap.cc b/src/mds/SessionMap.cc
index 598ad653f1444..68e1c50ecd14a 100644
--- a/src/mds/SessionMap.cc
+++ b/src/mds/SessionMap.cc
@@ -896,13 +896,8 @@ size_t Session::get_request_count()
  */
 void Session::notify_cap_release(size_t n_caps)
 {
-  recall_release_count += n_caps;
-  if (n_caps > 0) {
-    released_at = clock::now();
-    if (recall_count <= recall_release_count) {
-      clear_recalled();
-    }
-  }
+  recall_caps.hit(ceph_clock_now(), -(double)n_caps);
+  release_caps.hit(ceph_clock_now(), n_caps);
 }
 
 /**
@@ -911,25 +906,13 @@ void Session::notify_cap_release(size_t n_caps)
  * in order to generate health metrics if the session doesn't see
  * a commensurate number of calls to ::notify_cap_release
  */
-uint64_t Session::notify_recall_sent(const size_t new_limit)
+uint64_t Session::notify_recall_sent(size_t new_limit)
 {
   const auto num_caps = caps.size();
-  ceph_assert(new_limit < num_caps);
+  ceph_assert(new_limit < num_caps);  // Behaviour of Server::recall_client_state
   const auto count = num_caps-new_limit;
-
-  /* Entering recall phase, set up counters so we can later judge whether the
-   * client has respected the recall request. Update only if client has not
-   * released caps from a previous recall.
-   */
-
   uint64_t new_change;
   if (recall_limit != new_limit) {
-    const auto now = clock::now();
-    recalled_at = now;
-    assert (new_limit < num_caps);  // Behaviour of Server::recall_client_state
-    recall_count = count;
-    recall_release_count = 0;
-    recall_limit = new_limit;
     new_change = count;
   } else {
     new_change = 0; /* no change! */
@@ -940,17 +923,11 @@ uint64_t Session::notify_recall_sent(const size_t new_limit)
    * session that is not releasing caps (i.e. allow the session counter to
    * throttle future RECALL messages).
    */
-  cap_recalled.hit(count);
+  recall_caps_throttle.hit(ceph_clock_now(), count);
+  recall_caps.hit(ceph_clock_now(), count);
   return new_change;
 }
 
-void Session::clear_recalled()
-{
-  recall_count = 0;
-  recall_release_count = 0;
-  recall_limit = 0;
-}
-
 void Session::set_client_metadata(const client_metadata_t& meta)
 {
   info.client_metadata = meta;
@@ -1074,12 +1051,27 @@ void SessionMap::handle_conf_change(const struct md_config_t *conf,
     auto d = g_conf->get_val<double>("mds_recall_max_decay_rate");
     if (auto it = by_state.find(Session::STATE_OPEN); it != by_state.end()) {
       for (const auto &session : *(it->second)) {
-        session->cap_recalled = DecayCounter(ceph_clock_now(), d);
+        session->recall_caps_throttle = DecayCounter(ceph_clock_now(), d);
+      }
+    }
+    if (auto it = by_state.find(Session::STATE_STALE); it != by_state.end()) {
+      for (const auto &session : *(it->second)) {
+        session->recall_caps_throttle = DecayCounter(ceph_clock_now(), d);
+      }
+    }
+  }
+  if (changed.count("mds_recall_warning_decay_rate")) {
+    auto d = g_conf->get_val<double>("mds_recall_warning_decay_rate");
+    if (auto it = by_state.find(Session::STATE_OPEN); it != by_state.end()) {
+      for (const auto &session : *(it->second)) {
+        session->recall_caps = DecayCounter(ceph_clock_now(), d);
+        session->release_caps = DecayCounter(ceph_clock_now(), d);
       }
     }
     if (auto it = by_state.find(Session::STATE_STALE); it != by_state.end()) {
       for (const auto &session : *(it->second)) {
-        session->cap_recalled = DecayCounter(ceph_clock_now(), d); 
+        session->recall_caps = DecayCounter(ceph_clock_now(), d);
+        session->release_caps = DecayCounter(ceph_clock_now(), d);
       }
     }
   }
diff --git a/src/mds/SessionMap.h b/src/mds/SessionMap.h
index c72b98fc0256a..8e9e1ee1c2a98 100644
--- a/src/mds/SessionMap.h
+++ b/src/mds/SessionMap.h
@@ -113,8 +113,15 @@ private:
   mutable DecayCounter load_avg;
   DecayRate    load_avg_rate;
 
-  // caps being recalled recently by this session
-  DecayCounter cap_recalled;
+  // Ephemeral state for tracking progress of capability recalls
+  // caps being recalled recently by this session; used for Beacon warnings
+  mutable DecayCounter recall_caps;
+  // caps that have been released
+  mutable DecayCounter release_caps;
+  // throttle on caps recalled
+  mutable DecayCounter recall_caps_throttle;
+  // New limit in SESSION_RECALL
+  uint32_t recall_limit = 0;
 
   // session start time -- used to track average session time
   // note that this is initialized in the constructor rather
@@ -149,13 +156,6 @@ public:
   void set_client_metadata(const client_metadata_t& meta);
   const std::string& get_human_name() const {return human_name;}
 
-  // Ephemeral state for tracking progress of capability recalls
-  time recalled_at = time::min();  // When was I asked to SESSION_RECALL?
-  time released_at = time::min(); // When did the session last release caps?
-  uint32_t recall_count = 0;  // How many caps was I asked to SESSION_RECALL?
-  uint32_t recall_release_count = 0;  // How many caps have I actually revoked?
-  uint32_t recall_limit = 0;  // New limit in SESSION_RECALL
-
   session_info_t info;                         ///< durable bits
 
   MDSAuthCaps auth_caps;
@@ -171,11 +171,16 @@ public:
   interval_set<inodeno_t> pending_prealloc_inos; // journaling prealloc, will be added to prealloc_inos
 
   void notify_cap_release(size_t n_caps);
-  uint64_t notify_recall_sent(const size_t new_limit);
-  auto cap_recalled_counter() const {
-    return cap_recalled.get(ceph_clock_now());
+  uint64_t notify_recall_sent(size_t new_limit);
+  auto get_recall_caps_throttle() const {
+    return recall_caps_throttle.get(ceph_clock_now());
+  }
+  auto get_recall_caps() const {
+    return recall_caps.get(ceph_clock_now());
+  }
+  auto get_release_caps() const {
+    return release_caps.get(ceph_clock_now());
   }
-  void clear_recalled();
 
   inodeno_t next_ino() const {
     if (info.prealloc_inos.empty())
@@ -378,7 +383,9 @@ public:
 
   Session() = delete;
   Session(ConnectionRef con) :
-    cap_recalled(g_conf->get_val<double>("mds_recall_max_decay_rate")),
+    recall_caps(ceph_clock_now(), g_conf->get_val<double>("mds_recall_warning_decay_rate")),
+    release_caps(ceph_clock_now(), g_conf->get_val<double>("mds_recall_warning_decay_rate")),
+    recall_caps_throttle(ceph_clock_now(), g_conf->get_val<double>("mds_recall_max_decay_rate")),
     birth_time(clock::now()),
     auth_caps(g_ceph_context),
     item_session_list(this),