From: John Spray Date: Thu, 4 Sep 2014 15:47:38 +0000 (+0100) Subject: mds: add a health metric for failure to recall caps X-Git-Tag: v0.86~68^2~5 X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=e6062b8d336c4f68a066f33c9ead89740f16f743;p=ceph.git mds: add a health metric for failure to recall caps Fixes: #9284 Signed-off-by: John Spray --- diff --git a/src/common/config_opts.h b/src/common/config_opts.h index 8abdb33cea9c..132c9a971eb2 100644 --- a/src/common/config_opts.h +++ b/src/common/config_opts.h @@ -317,6 +317,7 @@ OPTION(mds_enforce_unique_name, OPT_BOOL, true) OPTION(mds_blacklist_interval, OPT_FLOAT, 24.0*60.0) // how long to blacklist failed nodes OPTION(mds_session_timeout, OPT_FLOAT, 60) // cap bits and leases time out if client idle OPTION(mds_revoke_cap_timeout, OPT_FLOAT, 60) // detect clients which aren't revoking caps +OPTION(mds_recall_state_timeout, OPT_FLOAT, 60) // detect clients which aren't trimming caps OPTION(mds_freeze_tree_timeout, OPT_FLOAT, 30) // detecting freeze tree deadlock OPTION(mds_session_autoclose, OPT_FLOAT, 300) // autoclose idle session OPTION(mds_reconnect_timeout, OPT_FLOAT, 45) // seconds to wait for clients during mds restart diff --git a/src/mds/Beacon.cc b/src/mds/Beacon.cc index 1a295ce96f7e..7f6af82d8301 100644 --- a/src/mds/Beacon.cc +++ b/src/mds/Beacon.cc @@ -15,11 +15,13 @@ #include "common/dout.h" #include "common/HeartbeatMap.h" +#include "include/stringify.h" #include "messages/MMDSBeacon.h" #include "mon/MonClient.h" #include "mds/MDS.h" #include "mds/MDLog.h" +#include "mds/Locker.h" #include "Beacon.h" @@ -261,5 +263,31 @@ void Beacon::notify_health(MDS const *mds) m.metadata["max_segments"] = g_conf->mds_log_max_segments; health.metrics.push_back(m); } + + // Detect clients failing to generate cap releases from SESSION_RECALL messages + // May be due to buggy client or resource-hogging application. 
+ set<Session*> sessions; + mds->sessionmap.get_client_session_set(sessions); + utime_t cutoff = ceph_clock_now(g_ceph_context); + cutoff -= g_conf->mds_recall_state_timeout; + + for (set<Session*>::iterator i = sessions.begin(); i != sessions.end(); ++i) { + Session *session = *i; + if (!session->recalled_at.is_zero()) { + dout(20) << "Session servicing RECALL " << session->info.inst + << ": " << session->recalled_at << " " << session->recall_release_count + << "/" << session->recall_count << dendl; + if (session->recalled_at < cutoff) { + dout(20) << " exceeded timeout " << session->recalled_at << " vs. " << cutoff << dendl; + std::ostringstream oss; + oss << "Client " << session->info.inst.name.num() << " failing to respond to cache pressure"; + MDSHealthMetric m(MDS_HEALTH_CLIENT_RECALL, HEALTH_WARN, oss.str()); + m.metadata["client_id"] = session->info.inst.name.num(); + health.metrics.push_back(m); + } else { + dout(20) << " within timeout " << session->recalled_at << " vs. " << cutoff << dendl; + } + } + } } diff --git a/src/mds/Locker.h b/src/mds/Locker.h index 36bd91e42f21..dbcaacc59199 100644 --- a/src/mds/Locker.h +++ b/src/mds/Locker.h @@ -50,6 +50,7 @@ class SimpleLock; class ScatterLock; class LocalLock; class MDCache; +typedef ceph::shared_ptr MDRequestRef; #include "SimpleLock.h" diff --git a/src/mds/SessionMap.h b/src/mds/SessionMap.h index c95af0a3ff0b..45f7e8264f31 100644 --- a/src/mds/SessionMap.h +++ b/src/mds/SessionMap.h @@ -81,12 +81,13 @@ private: int importing_count; friend class SessionMap; +public: + // Ephemeral state for tracking progress of capability recalls utime_t recalled_at; // When was I asked to SESSION_RECALL? uint32_t recall_count; // How many caps was I asked to SESSION_RECALL? uint32_t recall_release_count; // How many caps have I actually revoked? 
-public: session_info_t info; ///< durable bits ConnectionRef connection; @@ -333,8 +334,8 @@ public: if (p->second->info.inst.name.is_client()) s.insert(p->second->info.inst.name.num()); } - void get_client_session_set(set& s) { - for (ceph::unordered_map::iterator p = session_map.begin(); + void get_client_session_set(set& s) const { + for (ceph::unordered_map::const_iterator p = session_map.begin(); p != session_map.end(); ++p) if (p->second->info.inst.name.is_client()) diff --git a/src/messages/MMDSBeacon.h b/src/messages/MMDSBeacon.h index 33e6165ced9d..60adeadbe43e 100644 --- a/src/messages/MMDSBeacon.h +++ b/src/messages/MMDSBeacon.h @@ -32,7 +32,8 @@ */ enum mds_metric_t { MDS_HEALTH_NULL = 0, - MDS_HEALTH_TRIM = 1 + MDS_HEALTH_TRIM = 1, + MDS_HEALTH_CLIENT_RECALL = 2 }; /**