From e1bd216ed971f7b970c33a225ad099f96a63440d Mon Sep 17 00:00:00 2001 From: "yite.gu" Date: Mon, 15 Aug 2022 19:15:48 +0800 Subject: [PATCH] osd: add slow ops count into perf dump The slow ops count shows how many times slow ops have occurred on an OSD. In addition, because slow ops appear randomly across OSDs, the count lets us find the OSDs that have had the most slow ops. Usually, these OSDs also have the largest number of PGs. The slow ops count can therefore help find the target OSD. Signed-off-by: Yite Gu --- src/common/TrackedOp.cc | 5 ++++- src/common/TrackedOp.h | 21 ++++++++++++++++++++- 2 files changed, 24 insertions(+), 2 deletions(-) diff --git a/src/common/TrackedOp.cc b/src/common/TrackedOp.cc index 217d571d5cfc7..d63bdb8f9a574 100644 --- a/src/common/TrackedOp.cc +++ b/src/common/TrackedOp.cc @@ -88,8 +88,10 @@ void OpHistory::_insert_delayed(const utime_t& now, TrackedOpRef op) double opduration = op->get_duration(); duration.insert(make_pair(opduration, op)); arrived.insert(make_pair(op->get_initiated(), op)); - if (opduration >= history_slow_op_threshold.load()) + if (opduration >= history_slow_op_threshold.load()) { slow_op.insert(make_pair(op->get_initiated(), op)); + logger->inc(l_osd_slow_op_count); + } cleanup(now); } @@ -156,6 +158,7 @@ struct ShardedTrackingData { OpTracker::OpTracker(CephContext *cct_, bool tracking, uint32_t num_shards): seq(0), + history(cct_), num_optracker_shards(num_shards), complaint_time(0), log_threshold(0), tracking_enabled(tracking), diff --git a/src/common/TrackedOp.h b/src/common/TrackedOp.h index 03493a22e24df..0ff7430b372b5 100644 --- a/src/common/TrackedOp.h +++ b/src/common/TrackedOp.h @@ -51,8 +51,14 @@ public: void *entry() override; }; +enum { + l_osd_slow_op_first = 1000, + l_osd_slow_op_count, + l_osd_slow_op_last, +}; class OpHistory { + CephContext* cct = nullptr; std::set > arrived; std::set > duration; std::set > slow_op; @@ -65,15 +71,28 @@ class OpHistory { std::atomic_bool shutdown{false}; OpHistoryServiceThread opsvc; friend class 
OpHistoryServiceThread; + std::unique_ptr logger; public: - OpHistory() : opsvc(this) { + OpHistory(CephContext *c) : cct(c), opsvc(this) { + PerfCountersBuilder b(cct, "osd-slow-ops", + l_osd_slow_op_first, l_osd_slow_op_last); + b.add_u64_counter(l_osd_slow_op_count, "slow_ops_count", + "Number of operations taking over ten second"); + + logger.reset(b.create_perf_counters()); + cct->get_perfcounters_collection()->add(logger.get()); + opsvc.create("OpHistorySvc"); } ~OpHistory() { ceph_assert(arrived.empty()); ceph_assert(duration.empty()); ceph_assert(slow_op.empty()); + if(logger) { + cct->get_perfcounters_collection()->remove(logger.get()); + logger.reset(); + } } void insert(const utime_t& now, TrackedOpRef op) { -- 2.39.5