git.apps.os.sepia.ceph.com Git - ceph.git/commitdiff
mds: collapse numerous client warnings into one
author: John Spray <john.spray@redhat.com>
Mon, 8 Sep 2014 15:05:17 +0000 (16:05 +0100)
committer: John Spray <john.spray@redhat.com>
Wed, 17 Sep 2014 10:25:03 +0000 (11:25 +0100)
...to avoid sending O(N_clients) sized beacons.

Fixes: #9375
Signed-off-by: John Spray <john.spray@redhat.com>
src/common/config_opts.h
src/mds/Beacon.cc
src/messages/MMDSBeacon.h

index 132c9a971eb2ac7f76e7ca055fda0bf0476ae2f7..3a09b545cc68bf7ad10edddf567877428636080f 100644 (file)
@@ -320,6 +320,7 @@ OPTION(mds_revoke_cap_timeout, OPT_FLOAT, 60)    // detect clients which aren't
 OPTION(mds_recall_state_timeout, OPT_FLOAT, 60)    // detect clients which aren't trimming caps
 OPTION(mds_freeze_tree_timeout, OPT_FLOAT, 30)    // detecting freeze tree deadlock
 OPTION(mds_session_autoclose, OPT_FLOAT, 300) // autoclose idle session
+OPTION(mds_health_summarize_threshold, OPT_INT, 10) // collapse N-client health metrics to a single 'many'
 OPTION(mds_reconnect_timeout, OPT_FLOAT, 45)  // seconds to wait for clients during mds restart
              //  make it (mds_session_timeout - mds_beacon_grace)
 OPTION(mds_tick_interval, OPT_FLOAT, 5)
index 2256f320d1abd8aab63090eea396b0636aff4a23..af36ea708df6b670a3db9931de8eef37652ed9fb 100644 (file)
@@ -253,54 +253,87 @@ void Beacon::notify_health(MDS const *mds)
 
   // Detect MDS_HEALTH_TRIM condition
   // Arbitrary factor of 2, indicates MDS is not trimming promptly
-  if (mds->mdlog->get_num_segments() > (size_t)(g_conf->mds_log_max_segments * 2)) {
-    std::ostringstream oss;
-    oss << "Behind on trimming (" << mds->mdlog->get_num_segments()
-      << "/" << g_conf->mds_log_max_segments << ")";
-
-    MDSHealthMetric m(MDS_HEALTH_TRIM, HEALTH_WARN, oss.str());
-    m.metadata["num_segments"] = mds->mdlog->get_num_segments();
-    m.metadata["max_segments"] = g_conf->mds_log_max_segments;
-    health.metrics.push_back(m);
+  {
+    if (mds->mdlog->get_num_segments() > (size_t)(g_conf->mds_log_max_segments * 2)) {
+      std::ostringstream oss;
+      oss << "Behind on trimming (" << mds->mdlog->get_num_segments()
+        << "/" << g_conf->mds_log_max_segments << ")";
+
+      MDSHealthMetric m(MDS_HEALTH_TRIM, HEALTH_WARN, oss.str());
+      m.metadata["num_segments"] = mds->mdlog->get_num_segments();
+      m.metadata["max_segments"] = g_conf->mds_log_max_segments;
+      health.metrics.push_back(m);
+    }
   }
 
   // Detect clients failing to respond to modifications to capabilities in
   // CLIENT_CAPS messages.
-  std::list<client_t> late_clients;
-  mds->locker->get_late_revoking_clients(&late_clients);
-  for (std::list<client_t>::iterator i = late_clients.begin();
-          i != late_clients.end(); ++i) {
-    std::ostringstream oss;
-    oss << "client." << *i << " failing to respond to capability release";
-    MDSHealthMetric m(MDS_HEALTH_CLIENT_LATE_RELEASE, HEALTH_WARN, oss.str());
-    m.metadata["client_id"] = stringify(i->v);
-    health.metrics.push_back(m);
+  {
+
+    std::list<client_t> late_clients;
+    mds->locker->get_late_revoking_clients(&late_clients);
+    std::list<MDSHealthMetric> late_cap_metrics;
+
+    for (std::list<client_t>::iterator i = late_clients.begin(); i != late_clients.end(); ++i) {
+      std::ostringstream oss;
+      oss << "client." << *i << " failing to respond to capability release";
+      MDSHealthMetric m(MDS_HEALTH_CLIENT_LATE_RELEASE, HEALTH_WARN, oss.str());
+      m.metadata["client_id"] = stringify(i->v);
+      late_cap_metrics.push_back(m);
+    }
+
+    if (late_cap_metrics.size() <= (size_t)g_conf->mds_health_summarize_threshold) {
+      health.metrics.splice(health.metrics.end(), late_cap_metrics);
+    } else {
+      std::ostringstream oss;
+      oss << "Many clients (" << late_cap_metrics.size()
+          << ") failing to respond to capability release";
+      MDSHealthMetric m(MDS_HEALTH_CLIENT_LATE_RELEASE_MANY, HEALTH_WARN, oss.str());
+      m.metadata["client_count"] = late_cap_metrics.size();
+      health.metrics.push_back(m);
+      late_cap_metrics.clear();
+    }
   }
 
-  // Detect clients failing to generate cap releases from SESSION_RECALL messages
-  // May be due to buggy client or resource-hogging application.
-  set<Session*> sessions;
-  mds->sessionmap.get_client_session_set(sessions);
-  utime_t cutoff = ceph_clock_now(g_ceph_context);
-  cutoff -= g_conf->mds_recall_state_timeout;
-
-  for (set<Session*>::iterator i = sessions.begin(); i != sessions.end(); ++i) {
-    Session *session = *i;
-    if (!session->recalled_at.is_zero()) {
-      dout(20) << "Session servicing RECALL " << session->info.inst
-        << ": " << session->recalled_at << " " << session->recall_release_count
-        << "/" << session->recall_count << dendl;
-      if (session->recalled_at < cutoff) {
-        dout(20) << "  exceeded timeout " << session->recalled_at << " vs. " << cutoff << dendl;
-        std::ostringstream oss;
+  // Detect clients failing to generate cap releases from CEPH_SESSION_RECALL_STATE
+  // messages. May be due to buggy client or resource-hogging application.
+  {
+    set<Session*> sessions;
+    mds->sessionmap.get_client_session_set(sessions);
+    utime_t cutoff = ceph_clock_now(g_ceph_context);
+    cutoff -= g_conf->mds_recall_state_timeout;
+
+    std::list<MDSHealthMetric> late_recall_metrics;
+    for (set<Session*>::iterator i = sessions.begin(); i != sessions.end(); ++i) {
+      Session *session = *i;
+      if (!session->recalled_at.is_zero()) {
+        dout(20) << "Session servicing RECALL " << session->info.inst
+          << ": " << session->recalled_at << " " << session->recall_release_count
+          << "/" << session->recall_count << dendl;
+        if (session->recalled_at < cutoff) {
+          dout(20) << "  exceeded timeout " << session->recalled_at << " vs. " << cutoff << dendl;
+          std::ostringstream oss;
         oss << "Client " << session->info.inst.name.num() << " failing to respond to cache pressure";
-        MDSHealthMetric m(MDS_HEALTH_CLIENT_RECALL, HEALTH_WARN, oss.str());
-        m.metadata["client_id"] = session->info.inst.name.num();
-        health.metrics.push_back(m);
-      } else {
-        dout(20) << "  within timeout " << session->recalled_at << " vs. " << cutoff << dendl;
+          MDSHealthMetric m(MDS_HEALTH_CLIENT_RECALL, HEALTH_WARN, oss.str());
+          m.metadata["client_id"] = session->info.inst.name.num();
+          late_recall_metrics.push_back(m);
+        } else {
+          dout(20) << "  within timeout " << session->recalled_at << " vs. " << cutoff << dendl;
+        }
       }
     }
+
+    if (late_recall_metrics.size() <= (size_t)g_conf->mds_health_summarize_threshold) {
+      health.metrics.splice(health.metrics.end(), late_recall_metrics);
+    } else {
+      std::ostringstream oss;
+      oss << "Many clients (" << late_recall_metrics.size()
+          << ") failing to respond to cache pressure";
+      MDSHealthMetric m(MDS_HEALTH_CLIENT_RECALL_MANY, HEALTH_WARN, oss.str());
+      m.metadata["client_count"] = late_recall_metrics.size();
+      health.metrics.push_back(m);
+      late_recall_metrics.clear();
+    }
   }
 }
 
index 6fc41392b84bef72d219bd731bb728321bf8839a..8c103b17534730a543db3b2bbaabf097ed6b2d9a 100644 (file)
  */
 enum mds_metric_t {
   MDS_HEALTH_NULL = 0,
-  MDS_HEALTH_TRIM = 1,
-  MDS_HEALTH_CLIENT_RECALL = 2,
-  MDS_HEALTH_CLIENT_LATE_RELEASE = 3
+  MDS_HEALTH_TRIM,
+  MDS_HEALTH_CLIENT_RECALL,
+  MDS_HEALTH_CLIENT_LATE_RELEASE,
+  MDS_HEALTH_CLIENT_RECALL_MANY,
+  MDS_HEALTH_CLIENT_LATE_RELEASE_MANY
 };
 
 /**