git-server-git.apps.pok.os.sepia.ceph.com Git - ceph.git/commitdiff
mds: warn when clients are not advancing their oldest_client_tid
author: Yan, Zheng <zyan@redhat.com>
Thu, 16 Apr 2015 06:51:35 +0000 (14:51 +0800)
committer: Yan, Zheng <zyan@redhat.com>
Tue, 21 Apr 2015 15:14:15 +0000 (23:14 +0800)
Fixes: #10657
Signed-off-by: Yan, Zheng <zyan@redhat.com>
src/common/config_opts.h
src/mds/Beacon.cc
src/mds/Server.cc
src/mds/SessionMap.h
src/messages/MMDSBeacon.h

index 6218bb0b36759924fe10068dcccaf15228bb52a8..60afe9a98dc49d8593a0955e14e8c60969535772 100644 (file)
@@ -450,6 +450,8 @@ OPTION(mds_snap_min_uid, OPT_U32, 0) // The minimum UID required to create a sna
 OPTION(mds_snap_max_uid, OPT_U32, 65536) // The maximum UID allowed to create a snapshot
 OPTION(mds_snap_rstat, OPT_BOOL, false) // enable/disbale nested stat for snapshot
 OPTION(mds_verify_backtrace, OPT_U32, 1)
+// detect clients which aren't trimming completed requests
+OPTION(mds_max_completed_requests, OPT_U32, 100000)
 
 OPTION(mds_action_on_write_error, OPT_U32, 1) // 0: ignore; 1: force readonly; 2: crash
 OPTION(mds_mon_shutdown_timeout, OPT_DOUBLE, 5)
index e56a6362083e338f900e4dbb61565a690e2cb89a..35265db52303f3c3729aada0d554f1678266e639 100644 (file)
@@ -343,6 +343,8 @@ void Beacon::notify_health(MDS const *mds)
 
   // Detect clients failing to generate cap releases from CEPH_SESSION_RECALL_STATE
   // messages. May be due to buggy client or resource-hogging application.
+  //
+  // Detect clients failing to advance their oldest_client_tid
   {
     set<Session*> sessions;
     mds->sessionmap.get_client_session_set(sessions);
@@ -350,6 +352,7 @@ void Beacon::notify_health(MDS const *mds)
     cutoff -= g_conf->mds_recall_state_timeout;
 
     std::list<MDSHealthMetric> late_recall_metrics;
+    std::list<MDSHealthMetric> large_completed_requests_metrics;
     for (set<Session*>::iterator i = sessions.begin(); i != sessions.end(); ++i) {
       Session *session = *i;
       if (!session->recalled_at.is_zero()) {
@@ -359,7 +362,7 @@ void Beacon::notify_health(MDS const *mds)
         if (session->recalled_at < cutoff) {
           dout(20) << "  exceeded timeout " << session->recalled_at << " vs. " << cutoff << dendl;
           std::ostringstream oss;
-        oss << "Client " << session->get_human_name() << " failing to respond to cache pressure";
+         oss << "Client " << session->get_human_name() << " failing to respond to cache pressure";
           MDSHealthMetric m(MDS_HEALTH_CLIENT_RECALL, HEALTH_WARN, oss.str());
           m.metadata["client_id"] = session->info.inst.name.num();
           late_recall_metrics.push_back(m);
@@ -367,6 +370,14 @@ void Beacon::notify_health(MDS const *mds)
           dout(20) << "  within timeout " << session->recalled_at << " vs. " << cutoff << dendl;
         }
       }
+      if (session->get_num_trim_requests_warnings() > 0 &&
+         session->get_num_completed_requests() >= g_conf->mds_max_completed_requests) {
+       std::ostringstream oss;
+       oss << "Client " << session->get_human_name() << " failing to advance its oldest_client_tid";
+       MDSHealthMetric m(MDS_HEALTH_CLIENT_OLDEST_TID, HEALTH_WARN, oss.str());
+       m.metadata["client_id"] = session->info.inst.name.num();
+       large_completed_requests_metrics.push_back(m);
+      }
     }
 
     if (late_recall_metrics.size() <= (size_t)g_conf->mds_health_summarize_threshold) {
@@ -380,6 +391,18 @@ void Beacon::notify_health(MDS const *mds)
       health.metrics.push_back(m);
       late_recall_metrics.clear();
     }
+
+    if (large_completed_requests_metrics.size() <= (size_t)g_conf->mds_health_summarize_threshold) {
+      health.metrics.splice(health.metrics.end(), large_completed_requests_metrics);
+    } else {
+      std::ostringstream oss;
+      oss << "Many clients (" << large_completed_requests_metrics.size()
+       << ") failing to advance their oldest_client_tid";
+      MDSHealthMetric m(MDS_HEALTH_CLIENT_OLDEST_TID_MANY, HEALTH_WARN, oss.str());
+      m.metadata["client_count"] = large_completed_requests_metrics.size();
+      health.metrics.push_back(m);
+      large_completed_requests_metrics.clear();
+    }
   }
 }
 
index 6097fbb9f94c4b9381ce5c4547287a973d9e57f2..3e67f96a4cf70fcdc2b1d0a20b713bafdd9aeb8d 100644 (file)
@@ -1356,6 +1356,22 @@ void Server::handle_client_request(MClientRequest *req)
       // Sessions 'completed_requests' was dirtied, mark it to be
       // potentially flushed at segment expiry.
       mdlog->get_current_segment()->touched_sessions.insert(session->info.inst.name);
+
+      if (session->get_num_trim_requests_warnings() > 0 &&
+         session->get_num_completed_requests() * 2 < g_conf->mds_max_completed_requests)
+       session->reset_num_trim_requests_warnings();
+    } else {
+      if (session->get_num_completed_requests() >=
+         (g_conf->mds_max_completed_requests << session->get_num_trim_requests_warnings())) {
+       session->inc_num_trim_requests_warnings();
+       stringstream ss;
+       ss << "client." << session->get_client() << " does not advance its oldest_client_tid ("
+          << req->get_oldest_client_tid() << "), "
+          << session->get_num_completed_requests()
+          << " completed requests recorded in session\n";
+       mds->clog->warn() << ss.str();
+       dout(20) << __func__ << " " << ss.str() << dendl;
+      }
     }
   }
 
index b64871c6d960eac02715fd5ff52024a4f0bb49db..f1ffb08c73b13a386b917f0d2778e45b4f9d3003 100644 (file)
@@ -194,9 +194,6 @@ private:
   version_t cap_push_seq;        // cap push seq #
   map<version_t, list<MDSInternalContextBase*> > waitfor_flush; // flush session messages
 
-  // Has completed_requests been modified since the last time we
-  // wrote this session out?
-  bool completed_requests_dirty;
 public:
   xlist<Capability*> caps;     // inodes with caps; front=most recently used
   xlist<ClientLease*> leases;  // metadata leases to clients
@@ -231,8 +228,11 @@ public:
 
   // -- completed requests --
 private:
+  // Has completed_requests been modified since the last time we
+  // wrote this session out?
+  bool completed_requests_dirty;
 
-
+  unsigned num_trim_requests_warnings;
 public:
   void add_completed_request(ceph_tid_t t, inodeno_t created) {
     info.completed_requests[t] = created;
@@ -261,6 +261,11 @@ public:
     return true;
   }
 
+  unsigned get_num_completed_requests() const { return info.completed_requests.size(); }
+  unsigned get_num_trim_requests_warnings() { return num_trim_requests_warnings; }
+  void inc_num_trim_requests_warnings() { ++num_trim_requests_warnings; }
+  void reset_num_trim_requests_warnings() { num_trim_requests_warnings = 0; }
+
   bool has_dirty_completed_requests() const
   {
     return completed_requests_dirty;
@@ -278,8 +283,9 @@ public:
     connection(NULL), item_session_list(this),
     requests(0),  // member_offset passed to front() manually
     cap_push_seq(0),
+    lease_seq(0),
     completed_requests_dirty(false),
-    lease_seq(0) { }
+    num_trim_requests_warnings(0) { }
   ~Session() {
     assert(!item_session_list.is_on_list());
     while (!preopen_out_queue.empty()) {
index 48d5d04e561146537d464437ea36b686a6c00eb9..ab53760ace4ecd879bed0cec47a98acb2f53a6ba 100644 (file)
@@ -36,7 +36,9 @@ enum mds_metric_t {
   MDS_HEALTH_CLIENT_RECALL,
   MDS_HEALTH_CLIENT_LATE_RELEASE,
   MDS_HEALTH_CLIENT_RECALL_MANY,
-  MDS_HEALTH_CLIENT_LATE_RELEASE_MANY
+  MDS_HEALTH_CLIENT_LATE_RELEASE_MANY,
+  MDS_HEALTH_CLIENT_OLDEST_TID,
+  MDS_HEALTH_CLIENT_OLDEST_TID_MANY,
 };
 
 /**