From 3025b5f09a054ee96619874f17f9216eaaa465a4 Mon Sep 17 00:00:00 2001
From: John Spray
Date: Mon, 8 Sep 2014 16:05:17 +0100
Subject: [PATCH] mds: collapse numerous client warnings into one

...to avoid sending O(N_clients) sized beacons.

Fixes: #9375
[review: pass every health metric metadata value through stringify(),
 as the late-release client_id entry already does, so that integer
 values are stored as decimal text rather than assigned raw]
Signed-off-by: John Spray
---
 src/common/config_opts.h  |   1 +
 src/mds/Beacon.cc         | 111 ++++++++++++++++++++++++--------------
 src/messages/MMDSBeacon.h |   8 +--
 3 files changed, 78 insertions(+), 42 deletions(-)

diff --git a/src/common/config_opts.h b/src/common/config_opts.h
index 132c9a971eb2a..3a09b545cc68b 100644
--- a/src/common/config_opts.h
+++ b/src/common/config_opts.h
@@ -320,6 +320,7 @@ OPTION(mds_revoke_cap_timeout, OPT_FLOAT, 60) // detect clients which aren't
 OPTION(mds_recall_state_timeout, OPT_FLOAT, 60) // detect clients which aren't trimming caps
 OPTION(mds_freeze_tree_timeout, OPT_FLOAT, 30) // detecting freeze tree deadlock
 OPTION(mds_session_autoclose, OPT_FLOAT, 300) // autoclose idle session
+OPTION(mds_health_summarize_threshold, OPT_INT, 10) // collapse N-client health metrics to a single 'many'
 OPTION(mds_reconnect_timeout, OPT_FLOAT, 45) // seconds to wait for clients during mds restart
 // make it (mds_session_timeout - mds_beacon_grace)
 OPTION(mds_tick_interval, OPT_FLOAT, 5)
diff --git a/src/mds/Beacon.cc b/src/mds/Beacon.cc
index 2256f320d1abd..af36ea708df6b 100644
--- a/src/mds/Beacon.cc
+++ b/src/mds/Beacon.cc
@@ -253,54 +253,87 @@ void Beacon::notify_health(MDS const *mds)
 
   // Detect MDS_HEALTH_TRIM condition
   // Arbitrary factor of 2, indicates MDS is not trimming promptly
-  if (mds->mdlog->get_num_segments() > (size_t)(g_conf->mds_log_max_segments * 2)) {
-    std::ostringstream oss;
-    oss << "Behind on trimming (" << mds->mdlog->get_num_segments()
-      << "/" << g_conf->mds_log_max_segments << ")";
-
-    MDSHealthMetric m(MDS_HEALTH_TRIM, HEALTH_WARN, oss.str());
-    m.metadata["num_segments"] = mds->mdlog->get_num_segments();
-    m.metadata["max_segments"] = g_conf->mds_log_max_segments;
-    health.metrics.push_back(m);
+  {
+    if (mds->mdlog->get_num_segments() > (size_t)(g_conf->mds_log_max_segments * 2)) {
+      std::ostringstream oss;
+      oss << "Behind on trimming (" << mds->mdlog->get_num_segments()
+        << "/" << g_conf->mds_log_max_segments << ")";
+
+      MDSHealthMetric m(MDS_HEALTH_TRIM, HEALTH_WARN, oss.str());
+      m.metadata["num_segments"] = stringify(mds->mdlog->get_num_segments());
+      m.metadata["max_segments"] = stringify(g_conf->mds_log_max_segments);
+      health.metrics.push_back(m);
+    }
   }
 
   // Detect clients failing to respond to modifications to capabilities in
   // CLIENT_CAPS messages.
-  std::list<client_t> late_clients;
-  mds->locker->get_late_revoking_clients(&late_clients);
-  for (std::list<client_t>::iterator i = late_clients.begin();
-      i != late_clients.end(); ++i) {
-    std::ostringstream oss;
-    oss << "client." << *i << " failing to respond to capability release";
-    MDSHealthMetric m(MDS_HEALTH_CLIENT_LATE_RELEASE, HEALTH_WARN, oss.str());
-    m.metadata["client_id"] = stringify(i->v);
-    health.metrics.push_back(m);
+  {
+
+    std::list<client_t> late_clients;
+    mds->locker->get_late_revoking_clients(&late_clients);
+    std::list<MDSHealthMetric> late_cap_metrics;
+
+    for (std::list<client_t>::iterator i = late_clients.begin(); i != late_clients.end(); ++i) {
+      std::ostringstream oss;
+      oss << "client." << *i << " failing to respond to capability release";
+      MDSHealthMetric m(MDS_HEALTH_CLIENT_LATE_RELEASE, HEALTH_WARN, oss.str());
+      m.metadata["client_id"] = stringify(i->v);
+      late_cap_metrics.push_back(m);
+    }
+
+    if (late_cap_metrics.size() <= (size_t)g_conf->mds_health_summarize_threshold) {
+      health.metrics.splice(health.metrics.end(), late_cap_metrics);
+    } else {
+      std::ostringstream oss;
+      oss << "Many clients (" << late_cap_metrics.size()
+        << ") failing to respond to capability release";
+      MDSHealthMetric m(MDS_HEALTH_CLIENT_LATE_RELEASE_MANY, HEALTH_WARN, oss.str());
+      m.metadata["client_count"] = stringify(late_cap_metrics.size());
+      health.metrics.push_back(m);
+      late_cap_metrics.clear();
+    }
   }
 
-  // Detect clients failing to generate cap releases from SESSION_RECALL messages
-  // May be due to buggy client or resource-hogging application.
-  set<Session*> sessions;
-  mds->sessionmap.get_client_session_set(sessions);
-  utime_t cutoff = ceph_clock_now(g_ceph_context);
-  cutoff -= g_conf->mds_recall_state_timeout;
-
-  for (set<Session*>::iterator i = sessions.begin(); i != sessions.end(); ++i) {
-    Session *session = *i;
-    if (!session->recalled_at.is_zero()) {
-      dout(20) << "Session servicing RECALL " << session->info.inst
-        << ": " << session->recalled_at << " " << session->recall_release_count
-        << "/" << session->recall_count << dendl;
-      if (session->recalled_at < cutoff) {
-        dout(20) << " exceeded timeout " << session->recalled_at << " vs. " << cutoff << dendl;
-        std::ostringstream oss;
+  // Detect clients failing to generate cap releases from CEPH_SESSION_RECALL_STATE
+  // messages. May be due to buggy client or resource-hogging application.
+  {
+    set<Session*> sessions;
+    mds->sessionmap.get_client_session_set(sessions);
+    utime_t cutoff = ceph_clock_now(g_ceph_context);
+    cutoff -= g_conf->mds_recall_state_timeout;
+
+    std::list<MDSHealthMetric> late_recall_metrics;
+    for (set<Session*>::iterator i = sessions.begin(); i != sessions.end(); ++i) {
+      Session *session = *i;
+      if (!session->recalled_at.is_zero()) {
+        dout(20) << "Session servicing RECALL " << session->info.inst
+          << ": " << session->recalled_at << " " << session->recall_release_count
+          << "/" << session->recall_count << dendl;
+        if (session->recalled_at < cutoff) {
+          dout(20) << " exceeded timeout " << session->recalled_at << " vs. " << cutoff << dendl;
+          std::ostringstream oss;
         oss << "Client " << session->info.inst.name.num() << " failing to respond to cache pressure";
-        MDSHealthMetric m(MDS_HEALTH_CLIENT_RECALL, HEALTH_WARN, oss.str());
-        m.metadata["client_id"] = session->info.inst.name.num();
-        health.metrics.push_back(m);
-      } else {
-        dout(20) << " within timeout " << session->recalled_at << " vs. " << cutoff << dendl;
+          MDSHealthMetric m(MDS_HEALTH_CLIENT_RECALL, HEALTH_WARN, oss.str());
+          m.metadata["client_id"] = stringify(session->info.inst.name.num());
+          late_recall_metrics.push_back(m);
+        } else {
+          dout(20) << " within timeout " << session->recalled_at << " vs. " << cutoff << dendl;
+        }
       }
     }
+
+    if (late_recall_metrics.size() <= (size_t)g_conf->mds_health_summarize_threshold) {
+      health.metrics.splice(health.metrics.end(), late_recall_metrics);
+    } else {
+      std::ostringstream oss;
+      oss << "Many clients (" << late_recall_metrics.size()
+        << ") failing to respond to cache pressure";
+      MDSHealthMetric m(MDS_HEALTH_CLIENT_RECALL_MANY, HEALTH_WARN, oss.str());
+      m.metadata["client_count"] = stringify(late_recall_metrics.size());
+      health.metrics.push_back(m);
+      late_recall_metrics.clear();
+    }
   }
 }
 
diff --git a/src/messages/MMDSBeacon.h b/src/messages/MMDSBeacon.h
index 6fc41392b84be..8c103b1753473 100644
--- a/src/messages/MMDSBeacon.h
+++ b/src/messages/MMDSBeacon.h
@@ -32,9 +32,11 @@
  */
 enum mds_metric_t {
   MDS_HEALTH_NULL = 0,
-  MDS_HEALTH_TRIM = 1,
-  MDS_HEALTH_CLIENT_RECALL = 2,
-  MDS_HEALTH_CLIENT_LATE_RELEASE = 3
+  MDS_HEALTH_TRIM,
+  MDS_HEALTH_CLIENT_RECALL,
+  MDS_HEALTH_CLIENT_LATE_RELEASE,
+  MDS_HEALTH_CLIENT_RECALL_MANY,
+  MDS_HEALTH_CLIENT_LATE_RELEASE_MANY
 };
 
 /**
-- 
2.39.5