git.apps.os.sepia.ceph.com Git - ceph.git/commitdiff
mds: fix false "failing to respond to cache pressure" warning 11861/head
author Yan, Zheng <zyan@redhat.com>
Sat, 8 Oct 2016 07:16:40 +0000 (15:16 +0800)
committer Loic Dachary <ldachary@redhat.com>
Wed, 9 Nov 2016 14:13:42 +0000 (15:13 +0100)
The false warning happens in the following sequence of events:
- MDS has cache pressure, sends recall state messages to clients
- Client does not trim as many caps as MDS expected. So MDS
  does not reset session->recalled_at
- MDS no longer has cache pressure, so it stops sending recall state
  messages to clients.
- Client does not release its caps, so session->recalled_at in the
  MDS remains unchanged and eventually falls behind the recall
  timeout cutoff, which triggers the warning.

Signed-off-by: Yan, Zheng <zyan@redhat.com>
(cherry picked from commit 51c926a74e5ef478c11ccbcf11c351aa520dde2a)

src/mds/Beacon.cc
src/mds/MDCache.cc
src/mds/MDCache.h
src/mds/SessionMap.cc
src/mds/SessionMap.h

index 06020af105325e6e5e9305cc20f72e756f42eac6..b2565fd66227a2add39b2aadb88609b5cbfb34cb 100644 (file)
@@ -384,8 +384,10 @@ void Beacon::notify_health(MDSRank const *mds)
   {
     set<Session*> sessions;
     mds->sessionmap.get_client_session_set(sessions);
+
     utime_t cutoff = ceph_clock_now(g_ceph_context);
     cutoff -= g_conf->mds_recall_state_timeout;
+    utime_t last_recall = mds->mdcache->last_recall_state;
 
     std::list<MDSHealthMetric> late_recall_metrics;
     std::list<MDSHealthMetric> large_completed_requests_metrics;
@@ -395,7 +397,10 @@ void Beacon::notify_health(MDSRank const *mds)
         dout(20) << "Session servicing RECALL " << session->info.inst
           << ": " << session->recalled_at << " " << session->recall_release_count
           << "/" << session->recall_count << dendl;
-        if (session->recalled_at < cutoff) {
+       if (last_recall < cutoff || session->last_recall_sent < last_recall) {
+         dout(20) << "  no longer recall" << dendl;
+         session->clear_recalled_at();
+       } else if (session->recalled_at < cutoff) {
           dout(20) << "  exceeded timeout " << session->recalled_at << " vs. " << cutoff << dendl;
           std::ostringstream oss;
          oss << "Client " << session->get_human_name() << " failing to respond to cache pressure";
index 32269695e222f25ced8405b3361142b9657ca255..447d2795df89c2f54a90f7e95094b38522aeec1f 100644 (file)
@@ -7316,10 +7316,11 @@ void MDCache::check_memory_usage()
 
   if (num_inodes_with_caps > g_conf->mds_cache_size) {
     float ratio = (float)g_conf->mds_cache_size * .9 / (float)num_inodes_with_caps;
-    if (ratio < 1.0)
+    if (ratio < 1.0) {
+      last_recall_state = ceph_clock_now(g_ceph_context);
       mds->server->recall_client_state(ratio);
+    }
   }
-
 }
 
 
index 9152c90698544583f802a2b67dded131b34e4bcb..7a221108787a59bc38407bd6c53b38314b996182 100644 (file)
@@ -693,6 +693,8 @@ public:
   void trim_client_leases();
   void check_memory_usage();
 
+  utime_t last_recall_state;
+
   // shutdown
   void shutdown_start();
   void shutdown_check();
index d91713f06f96a306378840836a60edc1f9da5a36..247038a46f8911ec29880b9fb9c4860b1255a2eb 100644 (file)
@@ -771,11 +771,8 @@ void Session::notify_cap_release(size_t n_caps)
 {
   if (!recalled_at.is_zero()) {
     recall_release_count += n_caps;
-    if (recall_release_count >= recall_count) {
-      recalled_at = utime_t();
-      recall_count = 0;
-      recall_release_count = 0;
-    }
+    if (recall_release_count >= recall_count)
+      clear_recalled_at();
   }
 }
 
@@ -790,13 +787,22 @@ void Session::notify_recall_sent(int const new_limit)
   if (recalled_at.is_zero()) {
     // Entering recall phase, set up counters so we can later
     // judge whether the client has respected the recall request
-    recalled_at = ceph_clock_now(g_ceph_context);
+    recalled_at = last_recall_sent = ceph_clock_now(g_ceph_context);
     assert (new_limit < caps.size());  // Behaviour of Server::recall_client_state
     recall_count = caps.size() - new_limit;
     recall_release_count = 0;
+  } else {
+    last_recall_sent = ceph_clock_now(g_ceph_context);
   }
 }
 
+void Session::clear_recalled_at()
+{
+  recalled_at = last_recall_sent = utime_t();
+  recall_count = 0;
+  recall_release_count = 0;
+}
+
 void Session::set_client_metadata(map<string, string> const &meta)
 {
   info.client_metadata = meta;
index d03e16c85ac25931bc8bcf174d3a312040fba11f..6ddb603c5b75ee6beab241ac0803f9aaeffa8789 100644 (file)
@@ -123,6 +123,7 @@ public:
 
   // Ephemeral state for tracking progress of capability recalls
   utime_t recalled_at;  // When was I asked to SESSION_RECALL?
+  utime_t last_recall_sent;
   uint32_t recall_count;  // How many caps was I asked to SESSION_RECALL?
   uint32_t recall_release_count;  // How many caps have I actually revoked?
 
@@ -142,6 +143,7 @@ public:
 
   void notify_cap_release(size_t n_caps);
   void notify_recall_sent(int const new_limit);
+  void clear_recalled_at();
 
   inodeno_t next_ino() {
     if (info.prealloc_inos.empty())
@@ -309,7 +311,7 @@ public:
 
   Session() : 
     state(STATE_CLOSED), state_seq(0), importing_count(0),
-    recalled_at(), recall_count(0), recall_release_count(0),
+    recall_count(0), recall_release_count(0),
     auth_caps(g_ceph_context),
     connection(NULL), item_session_list(this),
     requests(0),  // member_offset passed to front() manually