From 92154c80a49cad7f77d562d93f7ad1b4f4a653fe Mon Sep 17 00:00:00 2001 From: Zhi Zhang Date: Wed, 15 Jun 2016 11:28:34 +0800 Subject: [PATCH] Ceph status outputs mds slow request for better monitoring Signed-off-by: Zhi Zhang --- src/common/TrackedOp.cc | 12 ++++++++---- src/common/TrackedOp.h | 2 +- src/mds/Beacon.cc | 13 +++++++++++++ src/mds/MDSRank.cc | 7 ++++++- src/mds/MDSRank.h | 4 ++++ src/messages/MMDSBeacon.h | 3 ++- 6 files changed, 34 insertions(+), 7 deletions(-) diff --git a/src/common/TrackedOp.cc b/src/common/TrackedOp.cc index 2554c0921edd..28458ad54bdf 100644 --- a/src/common/TrackedOp.cc +++ b/src/common/TrackedOp.cc @@ -173,7 +173,7 @@ void OpTracker::unregister_inflight_op(TrackedOp *i) } } -bool OpTracker::check_ops_in_flight(std::vector<string> &warning_vector) +bool OpTracker::check_ops_in_flight(std::vector<string> &warning_vector, int *slow) { RWLock::RLocker l(lock); if (!tracking_enabled) @@ -214,7 +214,11 @@ bool OpTracker::check_ops_in_flight(std::vector<string> &warning_vector) //store summary message warning_vector.push_back(""); - int slow = 0; // total slow + int _slow = 0; // total slow + if (!slow) + slow = &_slow; + else + *slow = _slow; // start from 0 anyway int warned = 0; // total logged for (uint32_t iter = 0; iter < num_optracker_shards; iter++) { ShardedTrackingData* sdata = sharded_in_flight_list[iter]; @@ -224,7 +228,7 @@ bool OpTracker::check_ops_in_flight(std::vector<string> &warning_vector) continue; xlist<TrackedOp*>::iterator i = sdata->ops_in_flight_sharded.begin(); while (!i.end() && (*i)->get_initiated() < too_old) { - slow++; + (*slow)++; // exponential backoff of warning intervals if (warned < log_threshold && @@ -252,7 +256,7 @@ bool OpTracker::check_ops_in_flight(std::vector<string> &warning_vector) // off, we will stay silent. 
if (warned > 0) { stringstream ss; - ss << slow << " slow requests, " << warned << " included below; oldest blocked for > " + ss << *slow << " slow requests, " << warned << " included below; oldest blocked for > " << oldest_secs << " secs"; warning_vector[0] = ss.str(); } diff --git a/src/common/TrackedOp.h b/src/common/TrackedOp.h index 9d0ff884a89e..ddaaa522e872 100644 --- a/src/common/TrackedOp.h +++ b/src/common/TrackedOp.h @@ -120,7 +120,7 @@ public: * with a warning string for each old Op. * @return True if there are any Ops to warn on, false otherwise. */ - bool check_ops_in_flight(std::vector<string> &warning_strings); + bool check_ops_in_flight(std::vector<string> &warning_strings, int *slow = NULL); void mark_event(TrackedOp *op, const string &evt, utime_t time = ceph_clock_now(g_ceph_context)); diff --git a/src/mds/Beacon.cc b/src/mds/Beacon.cc index 9a07b915974f..a25a73e27674 100644 --- a/src/mds/Beacon.cc +++ b/src/mds/Beacon.cc @@ -444,6 +444,19 @@ void Beacon::notify_health(MDSRank const *mds) } } + // Detect MDS_HEALTH_SLOW_REQUEST condition + { + int slow = mds->get_mds_slow_req_count(); + dout(20) << slow << " slow request found" << dendl; + if (slow) { + std::ostringstream oss; + oss << slow << " slow requests are blocked > " << g_conf->mds_op_complaint_time << " sec"; + + MDSHealthMetric m(MDS_HEALTH_SLOW_REQUEST, HEALTH_WARN, oss.str()); + health.metrics.push_back(m); + } + } + // Report a health warning if we are readonly if (mds->mdcache->is_readonly()) { MDSHealthMetric m(MDS_HEALTH_READ_ONLY, HEALTH_WARN, diff --git a/src/mds/MDSRank.cc b/src/mds/MDSRank.cc index feb48970e1fd..122a6b21b6f2 100644 --- a/src/mds/MDSRank.cc +++ b/src/mds/MDSRank.cc @@ -66,6 +66,7 @@ MDSRank::MDSRank( stopping(false), progress_thread(this), dispatch_depth(0), hb(NULL), last_tid(0), osd_epoch_barrier(0), beacon(beacon_), + mds_slow_req_count(0), last_client_mdsmap_bcast(0), messenger(msgr), monc(monc_), respawn_hook(respawn_hook_), @@ -2388,13 +2389,17 @@ void 
MDSRank::create_logger() void MDSRank::check_ops_in_flight() { vector<string> warnings; - if (op_tracker.check_ops_in_flight(warnings)) { + int slow = 0; + if (op_tracker.check_ops_in_flight(warnings, &slow)) { for (vector<string>::iterator i = warnings.begin(); i != warnings.end(); ++i) { clog->warn() << *i; } } + + // set mds slow request count + mds_slow_req_count = slow; return; } diff --git a/src/mds/MDSRank.h b/src/mds/MDSRank.h index 4da260d7ac9e..4f19471a9f1f 100644 --- a/src/mds/MDSRank.h +++ b/src/mds/MDSRank.h @@ -248,6 +248,8 @@ class MDSRank { * Emit clog warnings for any ops reported as warnings by optracker */ void check_ops_in_flight(); + + int mds_slow_req_count; /** * Share MDSMap with clients @@ -361,6 +363,8 @@ class MDSRank { MDSMap *get_mds_map() { return mdsmap; } int get_req_rate() { return logger->get(l_mds_request); } + + int get_mds_slow_req_count() const { return mds_slow_req_count; } void dump_status(Formatter *f) const; diff --git a/src/messages/MMDSBeacon.h b/src/messages/MMDSBeacon.h index 727aaad71474..51127410c647 100644 --- a/src/messages/MMDSBeacon.h +++ b/src/messages/MMDSBeacon.h @@ -38,7 +38,8 @@ enum mds_metric_t { MDS_HEALTH_CLIENT_OLDEST_TID, MDS_HEALTH_CLIENT_OLDEST_TID_MANY, MDS_HEALTH_DAMAGE, - MDS_HEALTH_READ_ONLY + MDS_HEALTH_READ_ONLY, + MDS_HEALTH_SLOW_REQUEST }; /** -- 2.47.3