OPTION(mds_blacklist_interval, OPT_FLOAT, 24.0*60.0) // how long to blacklist failed nodes
OPTION(mds_session_timeout, OPT_FLOAT, 60) // cap bits and leases time out if client idle
OPTION(mds_revoke_cap_timeout, OPT_FLOAT, 60) // detect clients which aren't revoking caps
+OPTION(mds_recall_state_timeout, OPT_FLOAT, 60) // how long a client may leave a SESSION_RECALL outstanding before a health warning is raised
OPTION(mds_freeze_tree_timeout, OPT_FLOAT, 30) // detecting freeze tree deadlock
OPTION(mds_session_autoclose, OPT_FLOAT, 300) // autoclose idle session
OPTION(mds_reconnect_timeout, OPT_FLOAT, 45) // seconds to wait for clients during mds restart
#include "common/dout.h"
#include "common/HeartbeatMap.h"
+#include "include/stringify.h"
#include "messages/MMDSBeacon.h"
#include "mon/MonClient.h"
#include "mds/MDS.h"
#include "mds/MDLog.h"
+#include "mds/Locker.h"
#include "Beacon.h"
m.metadata["max_segments"] = g_conf->mds_log_max_segments;
health.metrics.push_back(m);
}
+
+  // Detect clients failing to generate cap releases from SESSION_RECALL messages.
+  // May be due to a buggy client or a resource-hogging application.
+  //
+  // A session whose recalled_at is non-zero is treated as still servicing a
+  // recall.  If that timestamp is older than (now - mds_recall_state_timeout),
+  // a HEALTH_WARN metric naming the client is appended to health.metrics.
+  set<Session*> sessions;
+  mds->sessionmap.get_client_session_set(sessions);
+  utime_t cutoff = ceph_clock_now(g_ceph_context);
+  cutoff -= g_conf->mds_recall_state_timeout;
+
+  for (set<Session*>::iterator i = sessions.begin(); i != sessions.end(); ++i) {
+    Session *session = *i;
+    if (!session->recalled_at.is_zero()) {
+      dout(20) << "Session servicing RECALL " << session->info.inst
+        << ": " << session->recalled_at << " " << session->recall_release_count
+        << "/" << session->recall_count << dendl;
+      if (session->recalled_at < cutoff) {
+        dout(20) << " exceeded timeout " << session->recalled_at << " vs. " << cutoff << dendl;
+        std::ostringstream oss;
+        oss << "Client " << session->info.inst.name.num() << " failing to respond to cache pressure";
+        MDSHealthMetric m(MDS_HEALTH_CLIENT_RECALL, HEALTH_WARN, oss.str());
+        // Store the id as decimal text.  NOTE(review): assumes the metadata map
+        // holds std::string values -- confirm against MDSHealthMetric.  With a
+        // std::string target, assigning the bare integer would select
+        // operator=(char) and keep only a single truncated byte.
+        m.metadata["client_id"] = stringify(session->info.inst.name.num());
+        health.metrics.push_back(m);
+      } else {
+        dout(20) << " within timeout " << session->recalled_at << " vs. " << cutoff << dendl;
+      }
+    }
+  }
}
class ScatterLock;
class LocalLock;
class MDCache;
+typedef ceph::shared_ptr<MDRequestImpl> MDRequestRef;
#include "SimpleLock.h"
int importing_count;
friend class SessionMap;
+public:
+
// Ephemeral state for tracking progress of capability recalls
utime_t recalled_at; // When was I asked to SESSION_RECALL?
uint32_t recall_count; // How many caps was I asked to SESSION_RECALL?
uint32_t recall_release_count; // How many caps have I actually revoked?
-public:
session_info_t info; ///< durable bits
ConnectionRef connection;
if (p->second->info.inst.name.is_client())
s.insert(p->second->info.inst.name.num());
}
- void get_client_session_set(set<Session*>& s) {
- for (ceph::unordered_map<entity_name_t,Session*>::iterator p = session_map.begin();
+ void get_client_session_set(set<Session*>& s) const {
+ for (ceph::unordered_map<entity_name_t,Session*>::const_iterator p = session_map.begin();
p != session_map.end();
++p)
if (p->second->info.inst.name.is_client())
*/
enum mds_metric_t {
MDS_HEALTH_NULL = 0,
- MDS_HEALTH_TRIM = 1
+ MDS_HEALTH_TRIM = 1,
+ MDS_HEALTH_CLIENT_RECALL = 2 // client session has had a SESSION_RECALL outstanding longer than mds_recall_state_timeout
};
/**