From ea3bd24b3a1a235ee4d3ef3a7ac4bed42d39e052 Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Sat, 8 Oct 2016 15:16:40 +0800 Subject: [PATCH] mds: fix false "failing to respond to cache pressure" warning The false warning happens in the following sequence of events: - MDS has cache pressure, sends recall state messages to clients - Client does not trim as many caps as MDS expected. So MDS does not reset session->recalled_at - MDS no longer has cache pressure, it stops sending recall state messages to clients. - Client does not release its caps. So session->recalled_at in MDS stays unchanged Signed-off-by: Yan, Zheng (cherry picked from commit 51c926a74e5ef478c11ccbcf11c351aa520dde2a) --- src/mds/Beacon.cc | 7 ++++++- src/mds/MDCache.cc | 5 +++-- src/mds/MDCache.h | 2 ++ src/mds/SessionMap.cc | 18 ++++++++++++------ src/mds/SessionMap.h | 4 +++- 5 files changed, 26 insertions(+), 10 deletions(-) diff --git a/src/mds/Beacon.cc b/src/mds/Beacon.cc index 06020af105325..b2565fd66227a 100644 --- a/src/mds/Beacon.cc +++ b/src/mds/Beacon.cc @@ -384,8 +384,10 @@ void Beacon::notify_health(MDSRank const *mds) { set sessions; mds->sessionmap.get_client_session_set(sessions); + utime_t cutoff = ceph_clock_now(g_ceph_context); cutoff -= g_conf->mds_recall_state_timeout; + utime_t last_recall = mds->mdcache->last_recall_state; std::list late_recall_metrics; std::list large_completed_requests_metrics; @@ -395,7 +397,10 @@ void Beacon::notify_health(MDSRank const *mds) dout(20) << "Session servicing RECALL " << session->info.inst << ": " << session->recalled_at << " " << session->recall_release_count << "/" << session->recall_count << dendl; - if (session->recalled_at < cutoff) { + if (last_recall < cutoff || session->last_recall_sent < last_recall) { + dout(20) << " no longer recall" << dendl; + session->clear_recalled_at(); + } else if (session->recalled_at < cutoff) { dout(20) << " exceeded timeout " << session->recalled_at << " vs. 
" << cutoff << dendl; std::ostringstream oss; oss << "Client " << session->get_human_name() << " failing to respond to cache pressure"; diff --git a/src/mds/MDCache.cc b/src/mds/MDCache.cc index 32269695e222f..447d2795df89c 100644 --- a/src/mds/MDCache.cc +++ b/src/mds/MDCache.cc @@ -7316,10 +7316,11 @@ void MDCache::check_memory_usage() if (num_inodes_with_caps > g_conf->mds_cache_size) { float ratio = (float)g_conf->mds_cache_size * .9 / (float)num_inodes_with_caps; - if (ratio < 1.0) + if (ratio < 1.0) { + last_recall_state = ceph_clock_now(g_ceph_context); mds->server->recall_client_state(ratio); + } } - } diff --git a/src/mds/MDCache.h b/src/mds/MDCache.h index 9152c90698544..7a221108787a5 100644 --- a/src/mds/MDCache.h +++ b/src/mds/MDCache.h @@ -693,6 +693,8 @@ public: void trim_client_leases(); void check_memory_usage(); + utime_t last_recall_state; + // shutdown void shutdown_start(); void shutdown_check(); diff --git a/src/mds/SessionMap.cc b/src/mds/SessionMap.cc index d91713f06f96a..247038a46f891 100644 --- a/src/mds/SessionMap.cc +++ b/src/mds/SessionMap.cc @@ -771,11 +771,8 @@ void Session::notify_cap_release(size_t n_caps) { if (!recalled_at.is_zero()) { recall_release_count += n_caps; - if (recall_release_count >= recall_count) { - recalled_at = utime_t(); - recall_count = 0; - recall_release_count = 0; - } + if (recall_release_count >= recall_count) + clear_recalled_at(); } } @@ -790,13 +787,22 @@ void Session::notify_recall_sent(int const new_limit) if (recalled_at.is_zero()) { // Entering recall phase, set up counters so we can later // judge whether the client has respected the recall request - recalled_at = ceph_clock_now(g_ceph_context); + recalled_at = last_recall_sent = ceph_clock_now(g_ceph_context); assert (new_limit < caps.size()); // Behaviour of Server::recall_client_state recall_count = caps.size() - new_limit; recall_release_count = 0; + } else { + last_recall_sent = ceph_clock_now(g_ceph_context); } } +void 
Session::clear_recalled_at() +{ + recalled_at = last_recall_sent = utime_t(); + recall_count = 0; + recall_release_count = 0; +} + void Session::set_client_metadata(map const &meta) { info.client_metadata = meta; diff --git a/src/mds/SessionMap.h b/src/mds/SessionMap.h index d03e16c85ac25..6ddb603c5b75e 100644 --- a/src/mds/SessionMap.h +++ b/src/mds/SessionMap.h @@ -123,6 +123,7 @@ public: // Ephemeral state for tracking progress of capability recalls utime_t recalled_at; // When was I asked to SESSION_RECALL? + utime_t last_recall_sent; uint32_t recall_count; // How many caps was I asked to SESSION_RECALL? uint32_t recall_release_count; // How many caps have I actually revoked? @@ -142,6 +143,7 @@ public: void notify_cap_release(size_t n_caps); void notify_recall_sent(int const new_limit); + void clear_recalled_at(); inodeno_t next_ino() { if (info.prealloc_inos.empty()) @@ -309,7 +311,7 @@ public: Session() : state(STATE_CLOSED), state_seq(0), importing_count(0), - recalled_at(), recall_count(0), recall_release_count(0), + recall_count(0), recall_release_count(0), auth_caps(g_ceph_context), connection(NULL), item_session_list(this), requests(0), // member_offset passed to front() manually -- 2.39.5