git-server-git.apps.pok.os.sepia.ceph.com Git - ceph.git/commitdiff
mds: add a health metric for failure to recall caps
author John Spray <john.spray@redhat.com>
Thu, 4 Sep 2014 15:47:38 +0000 (16:47 +0100)
committer John Spray <john.spray@redhat.com>
Mon, 15 Sep 2014 14:05:14 +0000 (15:05 +0100)
Fixes: #9284
Signed-off-by: John Spray <john.spray@redhat.com>
src/common/config_opts.h
src/mds/Beacon.cc
src/mds/Locker.h
src/mds/SessionMap.h
src/messages/MMDSBeacon.h

index 8abdb33cea9ca6963a6cafeb9ae647e458560ebc..132c9a971eb2ac7f76e7ca055fda0bf0476ae2f7 100644 (file)
@@ -317,6 +317,7 @@ OPTION(mds_enforce_unique_name, OPT_BOOL, true)
 OPTION(mds_blacklist_interval, OPT_FLOAT, 24.0*60.0)  // how long to blacklist failed nodes
 OPTION(mds_session_timeout, OPT_FLOAT, 60)    // cap bits and leases time out if client idle
 OPTION(mds_revoke_cap_timeout, OPT_FLOAT, 60)    // detect clients which aren't revoking caps
+OPTION(mds_recall_state_timeout, OPT_FLOAT, 60)    // detect clients which aren't trimming caps
 OPTION(mds_freeze_tree_timeout, OPT_FLOAT, 30)    // detecting freeze tree deadlock
 OPTION(mds_session_autoclose, OPT_FLOAT, 300) // autoclose idle session
 OPTION(mds_reconnect_timeout, OPT_FLOAT, 45)  // seconds to wait for clients during mds restart
index 1a295ce96f7e01b9534fdbfb4c9d115312761bf6..7f6af82d8301455160eae2e0fb47a2790c13df98 100644 (file)
 
 #include "common/dout.h"
 #include "common/HeartbeatMap.h"
+#include "include/stringify.h"
 
 #include "messages/MMDSBeacon.h"
 #include "mon/MonClient.h"
 #include "mds/MDS.h"
 #include "mds/MDLog.h"
+#include "mds/Locker.h"
 
 #include "Beacon.h"
 
@@ -261,5 +263,31 @@ void Beacon::notify_health(MDS const *mds)
     m.metadata["max_segments"] = g_conf->mds_log_max_segments;
     health.metrics.push_back(m);
   }
+
+  // Detect clients failing to generate cap releases from SESSION_RECALL messages
+  // May be due to buggy client or resource-hogging application.
+  set<Session*> sessions;
+  mds->sessionmap.get_client_session_set(sessions);
+  utime_t cutoff = ceph_clock_now(g_ceph_context);
+  cutoff -= g_conf->mds_recall_state_timeout;
+
+  for (set<Session*>::iterator i = sessions.begin(); i != sessions.end(); ++i) {
+    Session *session = *i;
+    if (!session->recalled_at.is_zero()) {
+      dout(20) << "Session servicing RECALL " << session->info.inst
+        << ": " << session->recalled_at << " " << session->recall_release_count
+        << "/" << session->recall_count << dendl;
+      if (session->recalled_at < cutoff) {
+        dout(20) << "  exceeded timeout " << session->recalled_at << " vs. " << cutoff << dendl;
+        std::ostringstream oss;
+        oss << "Client " << session->info.inst.name.num() << " failing to respond to cache pressure";
+        MDSHealthMetric m(MDS_HEALTH_CLIENT_RECALL, HEALTH_WARN, oss.str());
+        m.metadata["client_id"] = session->info.inst.name.num();
+        health.metrics.push_back(m);
+      } else {
+        dout(20) << "  within timeout " << session->recalled_at << " vs. " << cutoff << dendl;
+      }
+    }
+  }
 }
 
index 36bd91e42f2105ce6ede5fe6d7a5763d05a90b39..dbcaacc59199a2e6c964c794d8fe2198c27d3c79 100644 (file)
@@ -50,6 +50,7 @@ class SimpleLock;
 class ScatterLock;
 class LocalLock;
 class MDCache;
+typedef ceph::shared_ptr<MDRequestImpl> MDRequestRef;
 
 #include "SimpleLock.h"
 
index c95af0a3ff0bc6c26af177909c1e657696cafe33..45f7e8264f31bf6c710644e328dd4d701aa945bc 100644 (file)
@@ -81,12 +81,13 @@ private:
   int importing_count;
   friend class SessionMap;
 
+public:
+
   // Ephemeral state for tracking progress of capability recalls
   utime_t recalled_at;  // When was I asked to SESSION_RECALL?
   uint32_t recall_count;  // How many caps was I asked to SESSION_RECALL?
   uint32_t recall_release_count;  // How many caps have I actually revoked?
 
-public:
   session_info_t info;                         ///< durable bits
 
   ConnectionRef connection;
@@ -333,8 +334,8 @@ public:
       if (p->second->info.inst.name.is_client())
        s.insert(p->second->info.inst.name.num());
   }
-  void get_client_session_set(set<Session*>& s) {
-    for (ceph::unordered_map<entity_name_t,Session*>::iterator p = session_map.begin();
+  void get_client_session_set(set<Session*>& s) const {
+    for (ceph::unordered_map<entity_name_t,Session*>::const_iterator p = session_map.begin();
         p != session_map.end();
         ++p)
       if (p->second->info.inst.name.is_client())
index 33e6165ced9de61025847eabb3e7d750f5256eb9..60adeadbe43e92d911159c80c59007414e4915db 100644 (file)
@@ -32,7 +32,8 @@
  */
 enum mds_metric_t {
   MDS_HEALTH_NULL = 0,
-  MDS_HEALTH_TRIM = 1
+  MDS_HEALTH_TRIM = 1,
+  MDS_HEALTH_CLIENT_RECALL = 2
 };
 
 /**