From: Kefu Chai Date: Fri, 27 Oct 2017 08:55:14 +0000 (+0800) Subject: common/OpTracker: refactor check_ops_in_flight() X-Git-Tag: v13.0.1~103^2~5 X-Git-Url: http://git.apps.os.sepia.ceph.com/?a=commitdiff_plain;h=721cff9a971a14c884addf2bc492ab6356b57a09;p=ceph-ci.git common/OpTracker: refactor check_ops_in_flight() * add a function visit_ops_in_flight() so it accepts a std::function, which can be applied to all ops in flight. and is used to count slow ops. the # of slow ops will be sent to mgr. * add a function with_slow_ops_in_flight(), so we can use it to implement command like "dump_slow_ops_in_flight". unlike the existing feature which dumps slow ops to clog periodically, it will dump the slow ops on demand. and backoff the warning interval only when the slow ops are dumped. Signed-off-by: Kefu Chai --- diff --git a/src/common/TrackedOp.cc b/src/common/TrackedOp.cc index c89be0b60a3..2948bfe0e96 100644 --- a/src/common/TrackedOp.cc +++ b/src/common/TrackedOp.cc @@ -284,21 +284,19 @@ void OpTracker::unregister_inflight_op(TrackedOp *i) } } -bool OpTracker::check_ops_in_flight(std::vector &warning_vector, int *slow) +bool OpTracker::visit_ops_in_flight(utime_t* oldest_secs, + std::function&& visit) { if (!tracking_enabled) return false; - RWLock::RLocker l(lock); - utime_t now = ceph_clock_now(); - utime_t too_old = now; - too_old -= complaint_time; + const utime_t now = ceph_clock_now(); utime_t oldest_op = now; uint64_t total_ops_in_flight = 0; - for (uint32_t i = 0; i < num_optracker_shards; i++) { - ShardedTrackingData* sdata = sharded_in_flight_list[i]; - assert(NULL != sdata); + RWLock::RLocker l(lock); + for (const auto sdata : sharded_in_flight_list) { + assert(sdata); Mutex::Locker locker(sdata->ops_in_flight_lock_sharded); if (!sdata->ops_in_flight_sharded.empty()) { utime_t oldest_op_tmp = @@ -309,72 +307,102 @@ bool OpTracker::check_ops_in_flight(std::vector &warning_vector, int *sl } total_ops_in_flight += sdata->ops_in_flight_sharded.size(); } - - if (0 == total_ops_in_flight) + if (!total_ops_in_flight) return false; - - utime_t oldest_secs = now - oldest_op; - + *oldest_secs = now - oldest_op; dout(10) << "ops_in_flight.size: " << total_ops_in_flight - << "; oldest is " << oldest_secs + << "; oldest is " << *oldest_secs << " seconds old" << dendl; - if (oldest_secs < complaint_time) + if (*oldest_secs < complaint_time) return false; - warning_vector.reserve(log_threshold + 1); - //store summary message - warning_vector.push_back(""); - - int _slow = 0; // total slow - if (!slow) - slow = &_slow; - else - *slow = _slow; // start from 0 anyway - int warned = 0; // total logged for (uint32_t iter = 0; iter < num_optracker_shards; iter++) { ShardedTrackingData* sdata = sharded_in_flight_list[iter]; assert(NULL != sdata); Mutex::Locker locker(sdata->ops_in_flight_lock_sharded); - if (sdata->ops_in_flight_sharded.empty()) - continue; - auto i = sdata->ops_in_flight_sharded.begin(); - while (i != sdata->ops_in_flight_sharded.end() && - i->get_initiated() < too_old) { - (*slow)++; - - // exponential backoff of warning intervals - if (warned < log_threshold && - (i->get_initiated() + (complaint_time * - i->warn_interval_multiplier)) < now) { - // will warn, increase counter - warned++; - - utime_t age = now - i->get_initiated(); - stringstream ss; - ss << "slow request " << age << " seconds old, received at " - << i->get_initiated() << ": " << i->get_desc() - << " currently " - << (i->current ? i->current : i->state_string()); - warning_vector.push_back(ss.str()); - - // only those that have been shown will backoff - i->warn_interval_multiplier *= 2; - } - ++i; + for (auto& op : sdata->ops_in_flight_sharded) { + if (!visit(op)) + break; } } + return true; +} - // only summarize if we warn about any. if everything has backed - // off, we will stay silent. - if (warned > 0) { +bool OpTracker::with_slow_ops_in_flight(utime_t* oldest_secs, + int* num_slow_ops, + std::function&& on_warn) +{ + const utime_t now = ceph_clock_now(); + auto too_old = now; + too_old -= complaint_time; + int slow = 0; + int warned = 0; + auto check = [&](TrackedOp& op) { + if (op.get_initiated() >= too_old) { + // no more slow ops in flight + return false; + } + slow++; + if (warned >= log_threshold) { + // enough samples of slow ops + return true; + } + auto time_to_complain = (op.get_initiated() + + complaint_time * op.warn_interval_multiplier); + if (time_to_complain >= now) { + // complain later if the op is still in flight + return true; + } + // will warn, increase counter + warned++; + on_warn(op); + return true; + }; + if (visit_ops_in_flight(oldest_secs, check)) { + if (num_slow_ops) { + *num_slow_ops = slow; + } + return true; + } else { + return false; + } +} + +bool OpTracker::check_ops_in_flight(std::string* summary, + std::vector &warnings, + int *num_slow_ops) +{ + const utime_t now = ceph_clock_now(); + auto too_old = now; + too_old -= complaint_time; + int warned = 0; + utime_t oldest_secs; + auto warn_on_slow_op = [&](TrackedOp& op) { + stringstream ss; + utime_t age = now - op.get_initiated(); + ss << "slow request " << age << " seconds old, received at " + << op.get_initiated() << ": " << op.get_desc() + << " currently " + << (op.current ? op.current : op.state_string()); + warnings.push_back(ss.str()); + // only those that have been shown will backoff + op.warn_interval_multiplier *= 2; + }; + int slow = 0; + if (with_slow_ops_in_flight(&oldest_secs, &slow, warn_on_slow_op)) { stringstream ss; - ss << *slow << " slow requests, " << warned << " included below; oldest blocked for > " + ss << slow << " slow requests, " + << warned << " included below; oldest blocked for > " << oldest_secs << " secs"; - warning_vector[0] = ss.str(); + *summary = ss.str(); + if (num_slow_ops) { + *num_slow_ops = slow; + } + return true; + } else { + return false; } - - return warned > 0; } void OpTracker::get_age_ms_histogram(pow2_hist_t *h) diff --git a/src/common/TrackedOp.h b/src/common/TrackedOp.h index 710b3bdacaf..c4731642c8b 100644 --- a/src/common/TrackedOp.h +++ b/src/common/TrackedOp.h @@ -100,16 +100,41 @@ public: void get_age_ms_histogram(pow2_hist_t *h); + /** + * walk through ops in flight + * + * @param oldest_sec the amount of time since the oldest op was initiated + * @param check a function consuming tracked ops, the function returns + * false if it don't want to be fed with more ops + * @return True if there are any Ops to warn on, false otherwise + */ + bool visit_ops_in_flight(utime_t* oldest_secs, + std::function&& visit); + /** + * walk through slow ops in flight + * + * @param[out] oldest_sec the amount of time since the oldest op was initiated + * @param[out] num_slow_ops total number of slow ops + * @param on_warn a function consuming tracked ops, the function returns + * false if it don't want to be fed with more ops + * @return True if there are any Ops to warn on, false otherwise + */ + bool with_slow_ops_in_flight(utime_t* oldest_secs, + int* num_slow_ops, + std::function&& on_warn); /** * Look for Ops which are too old, and insert warning * strings for each Op that is too old. * - * @param warning_strings A vector reference which is filled + * @param summary[out] a string summarizing slow Ops. + * @param warning_strings[out] A vector reference which is filled * with a warning string for each old Op. + * @param slow[out] total number of slow ops * @return True if there are any Ops to warn on, false otherwise. */ - bool check_ops_in_flight(std::vector &warning_strings, - int *slow = NULL); + bool check_ops_in_flight(std::string* summary, + std::vector &warning_strings, + int* slow = nullptr); void on_shutdown() { history.on_shutdown(); diff --git a/src/mds/MDSRank.cc b/src/mds/MDSRank.cc index 51ae378947f..82606474ce8 100644 --- a/src/mds/MDSRank.cc +++ b/src/mds/MDSRank.cc @@ -2637,13 +2637,13 @@ void MDSRank::create_logger() void MDSRank::check_ops_in_flight() { + string summary; vector warnings; int slow = 0; - if (op_tracker.check_ops_in_flight(warnings, &slow)) { - for (vector::iterator i = warnings.begin(); - i != warnings.end(); - ++i) { - clog->warn() << *i; + if (op_tracker.check_ops_in_flight(&summary, warnings, &slow)) { + clog->warn() << summary; + for (const auto& warning : warnings) { + clog->warn() << warning; } } diff --git a/src/osd/OSD.cc b/src/osd/OSD.cc index 0c72845e44a..dee9f7ab1eb 100644 --- a/src/osd/OSD.cc +++ b/src/osd/OSD.cc @@ -5025,12 +5025,12 @@ void OSD::tick_without_osd_lock() void OSD::check_ops_in_flight() { + string summary; vector warnings; - if (op_tracker.check_ops_in_flight(warnings)) { - for (vector::iterator i = warnings.begin(); - i != warnings.end(); - ++i) { - clog->warn() << *i; + if (op_tracker.check_ops_in_flight(&summary, warnings)) { + clog->warn() << summary; + for (const auto& warning : warnings) { + clog->warn() << warning; } } }