From: Yan, Zheng Date: Sun, 7 Jun 2015 08:30:59 +0000 (+0800) Subject: mds: warn if client does not advance its oldest flush tid X-Git-Tag: v9.1.0~57^2 X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=refs%2Fpull%2F4791%2Fhead;p=ceph.git mds: warn if client does not advance its oldest flush tid Signed-off-by: Yan, Zheng --- diff --git a/src/common/config_opts.h b/src/common/config_opts.h index b21f69a788ba..7c5133a63742 100644 --- a/src/common/config_opts.h +++ b/src/common/config_opts.h @@ -475,6 +475,7 @@ OPTION(mds_snap_max_uid, OPT_U32, 65536) // The maximum UID allowed to create a OPTION(mds_snap_rstat, OPT_BOOL, false) // enable/disbale nested stat for snapshot OPTION(mds_verify_backtrace, OPT_U32, 1) // detect clients which aren't trimming completed requests +OPTION(mds_max_completed_flushes, OPT_U32, 100000) OPTION(mds_max_completed_requests, OPT_U32, 100000) OPTION(mds_action_on_write_error, OPT_U32, 1) // 0: ignore; 1: force readonly; 2: crash diff --git a/src/mds/Beacon.cc b/src/mds/Beacon.cc index 1312758435f0..ea34ded8a13d 100644 --- a/src/mds/Beacon.cc +++ b/src/mds/Beacon.cc @@ -396,10 +396,12 @@ void Beacon::notify_health(MDSRank const *mds) dout(20) << " within timeout " << session->recalled_at << " vs. " << cutoff << dendl; } } - if (session->get_num_trim_requests_warnings() > 0 && - session->get_num_completed_requests() >= g_conf->mds_max_completed_requests) { + if ((session->get_num_trim_requests_warnings() > 0 && + session->get_num_completed_requests() >= g_conf->mds_max_completed_requests) || + (session->get_num_trim_flushes_warnings() > 0 && + session->get_num_completed_flushes() >= g_conf->mds_max_completed_flushes)) { std::ostringstream oss; - oss << "Client " << session->get_human_name() << " failing to advance its oldest_client_tid"; + oss << "Client " << session->get_human_name() << " failing to advance its oldest client/flush tid"; MDSHealthMetric m(MDS_HEALTH_CLIENT_OLDEST_TID, HEALTH_WARN, oss.str()); m.metadata["client_id"] = session->info.inst.name.num(); large_completed_requests_metrics.push_back(m); @@ -423,7 +425,7 @@ void Beacon::notify_health(MDSRank const *mds) } else { std::ostringstream oss; oss << "Many clients (" << large_completed_requests_metrics.size() - << ") failing to advance their oldest_client_tid"; + << ") failing to advance their oldest client/flush tid"; MDSHealthMetric m(MDS_HEALTH_CLIENT_OLDEST_TID_MANY, HEALTH_WARN, oss.str()); m.metadata["client_count"] = large_completed_requests_metrics.size(); health.metrics.push_back(m); diff --git a/src/mds/Locker.cc b/src/mds/Locker.cc index 3849ce464da0..fb548a56e6d6 100644 --- a/src/mds/Locker.cc +++ b/src/mds/Locker.cc @@ -2491,6 +2491,22 @@ void Locker::handle_client_caps(MClientCaps *m) if (m->get_oldest_flush_tid() > 0) { if (session->trim_completed_flushes(m->get_oldest_flush_tid())) { mds->mdlog->get_current_segment()->touched_sessions.insert(session->info.inst.name); + + if (session->get_num_trim_flushes_warnings() > 0 && + session->get_num_completed_flushes() * 2 < g_conf->mds_max_completed_flushes) + session->reset_num_trim_flushes_warnings(); + } else { + if (session->get_num_completed_flushes() >= + (g_conf->mds_max_completed_flushes << session->get_num_trim_flushes_warnings())) { + session->inc_num_trim_flushes_warnings(); + stringstream ss; + ss << "client." << session->get_client() << " does not advance its oldest_flush_tid (" + << m->get_oldest_flush_tid() << "), " + << session->get_num_completed_flushes() + << " completed flushes recorded in session\n"; + mds->clog->warn() << ss.str(); + dout(20) << __func__ << " " << ss.str() << dendl; + } } } diff --git a/src/mds/SessionMap.h b/src/mds/SessionMap.h index b73246e5baab..cfcfa04a6fa8 100644 --- a/src/mds/SessionMap.h +++ b/src/mds/SessionMap.h @@ -232,6 +232,7 @@ private: // wrote this session out? bool completed_requests_dirty; + unsigned num_trim_flushes_warnings; unsigned num_trim_requests_warnings; public: void add_completed_request(ceph_tid_t t, inodeno_t created) { @@ -280,6 +281,11 @@ public: return info.completed_flushes.count(tid); } + unsigned get_num_completed_flushes() const { return info.completed_flushes.size(); } + unsigned get_num_trim_flushes_warnings() { return num_trim_flushes_warnings; } + void inc_num_trim_flushes_warnings() { ++num_trim_flushes_warnings; } + void reset_num_trim_flushes_warnings() { num_trim_flushes_warnings = 0; } + unsigned get_num_completed_requests() const { return info.completed_requests.size(); } unsigned get_num_trim_requests_warnings() { return num_trim_requests_warnings; } void inc_num_trim_requests_warnings() { ++num_trim_requests_warnings; } @@ -304,6 +310,7 @@ public: cap_push_seq(0), lease_seq(0), completed_requests_dirty(false), + num_trim_flushes_warnings(0), num_trim_requests_warnings(0) { } ~Session() { assert(!item_session_list.is_on_list());