From 1b4e416f819a95790320cb63cfa50bb4c3822042 Mon Sep 17 00:00:00 2001 From: Aishwarya Mathuria Date: Tue, 24 Aug 2021 20:25:02 +0530 Subject: [PATCH] osd/scrub: Add scrub duration to pg dump stats Addition of a new column, SCRUB_DURATION, to the pg stats that stores the time taken for a PG scrub. Fixes: https://tracker.ceph.com/issues/52605 Signed-off-by: Aishwarya Mathuria --- qa/standalone/scrub/osd-scrub-test.sh | 37 ++++++++++++++++++++++++++ src/mon/PGMap.cc | 4 +++ src/osd/osd_types.cc | 11 ++++++-- src/osd/osd_types.h | 5 +++- src/osd/scrubber/pg_scrubber.cc | 14 ++++++++++ src/osd/scrubber/pg_scrubber.h | 6 +++++ src/osd/scrubber/scrub_machine.cc | 16 +++++++++++ src/osd/scrubber/scrub_machine.h | 8 +++--- src/osd/scrubber/scrub_machine_lstnr.h | 4 +++ 9 files changed, 99 insertions(+), 6 deletions(-) diff --git a/qa/standalone/scrub/osd-scrub-test.sh b/qa/standalone/scrub/osd-scrub-test.sh index 6cea0b8fcf832..9105474e30f21 100755 --- a/qa/standalone/scrub/osd-scrub-test.sh +++ b/qa/standalone/scrub/osd-scrub-test.sh @@ -454,6 +454,43 @@ function TEST_scrub_permit_time() { teardown $dir || return 1 } +function TEST_pg_dump_scrub_duration() { + local dir=$1 + local poolname=test + local OSDS=3 + local objects=15 + + TESTDATA="testdata.$$" + + setup $dir || return 1 + run_mon $dir a --osd_pool_default_size=$OSDS || return 1 + run_mgr $dir x || return 1 + for osd in $(seq 0 $(expr $OSDS - 1)) + do + run_osd $dir $osd || return 1 + done + + # Create a pool with a single pg + create_pool $poolname 1 1 + wait_for_clean || return 1 + poolid=$(ceph osd dump | grep "^pool.*[']${poolname}[']" | awk '{ print $2 }') + + dd if=/dev/urandom of=$TESTDATA bs=1032 count=1 + for i in `seq 1 $objects` + do + rados -p $poolname put obj${i} $TESTDATA + done + rm -f $TESTDATA + + local pgid="${poolid}.0" + pg_scrub $pgid || return 1 + + ceph pg $pgid query | jq '.info.stats.scrub_duration' + test "$(ceph pg $pgid query | jq '.info.stats.scrub_duration')" '>' "0" || return 1 + + teardown $dir || return 1 +} + main osd-scrub-test "$@" # Local Variables: diff --git a/src/mon/PGMap.cc b/src/mon/PGMap.cc index cde8f0dd0cd60..cb90535addd22 100644 --- a/src/mon/PGMap.cc +++ b/src/mon/PGMap.cc @@ -1657,6 +1657,7 @@ void PGMap::dump_pg_stats_plain( tab.define_column("LAST_DEEP_SCRUB", TextTable::LEFT, TextTable::RIGHT); tab.define_column("DEEP_SCRUB_STAMP", TextTable::LEFT, TextTable::RIGHT); tab.define_column("SNAPTRIMQ_LEN", TextTable::LEFT, TextTable::RIGHT); + tab.define_column("SCRUB_DURATION", TextTable::LEFT, TextTable::RIGHT); } for (auto i = pg_stats.begin(); @@ -1698,6 +1699,7 @@ void PGMap::dump_pg_stats_plain( << st.last_deep_scrub << st.last_deep_scrub_stamp << st.snaptrimq_len + << st.scrub_duration << TextTable::endrow; } } @@ -2228,6 +2230,7 @@ void PGMap::dump_filtered_pg_stats(ostream& ss, set& pgs) const tab.define_column("ACTING", TextTable::LEFT, TextTable::RIGHT); tab.define_column("SCRUB_STAMP", TextTable::LEFT, TextTable::RIGHT); tab.define_column("DEEP_SCRUB_STAMP", TextTable::LEFT, TextTable::RIGHT); + tab.define_column("SCRUB_DURATION", TextTable::LEFT, TextTable::RIGHT); for (auto i = pgs.begin(); i != pgs.end(); ++i) { const pg_stat_t& st = pg_stat.at(*i); @@ -2255,6 +2258,7 @@ void PGMap::dump_filtered_pg_stats(ostream& ss, set& pgs) const << actingstr.str() << st.last_scrub_stamp << st.last_deep_scrub_stamp + << st.scrub_duration << TextTable::endrow; } diff --git a/src/osd/osd_types.cc b/src/osd/osd_types.cc index 4e1a9b6b8ae3e..3bb4dfbab4e4f 100644 --- a/src/osd/osd_types.cc +++ b/src/osd/osd_types.cc @@ -2856,6 +2856,7 @@ void pg_stat_t::dump(Formatter *f) const f->dump_bool("pin_stats_invalid", pin_stats_invalid); f->dump_bool("manifest_stats_invalid", manifest_stats_invalid); f->dump_unsigned("snaptrimq_len", snaptrimq_len); + f->dump_float("scrub_duration", scrub_duration); stats.dump(f); f->open_array_section("up"); for (auto p = up.cbegin(); p != up.cend(); ++p) @@ -2910,7 +2911,7 @@ void pg_stat_t::dump_brief(Formatter *f) const void pg_stat_t::encode(ceph::buffer::list &bl) const { - ENCODE_START(26, 22, bl); + ENCODE_START(27, 22, bl); encode(version, bl); encode(reported_seq, bl); encode(reported_epoch, bl); @@ -2958,6 +2959,7 @@ void pg_stat_t::encode(ceph::buffer::list &bl) const encode(manifest_stats_invalid, bl); encode(avail_no_missing, bl); encode(object_location_counts, bl); + encode(scrub_duration, bl); ENCODE_FINISH(bl); } @@ -3032,6 +3034,9 @@ void pg_stat_t::decode(ceph::buffer::list::const_iterator &bl) decode(avail_no_missing, bl); decode(object_location_counts, bl); } + if (struct_v >= 27) { + decode(scrub_duration, bl); + } } DECODE_FINISH(bl); } @@ -3064,6 +3069,7 @@ void pg_stat_t::generate_test_instances(list& o) a.last_deep_scrub = eversion_t(13, 14); a.last_deep_scrub_stamp = utime_t(15, 16); a.last_clean_scrub_stamp = utime_t(17, 18); + a.scrub_duration = 0.003; a.snaptrimq_len = 1048576; list l; object_stat_collection_t::generate_test_instances(l); @@ -3137,7 +3143,8 @@ bool operator==(const pg_stat_t& l, const pg_stat_t& r) l.pin_stats_invalid == r.pin_stats_invalid && l.manifest_stats_invalid == r.manifest_stats_invalid && l.purged_snaps == r.purged_snaps && - l.snaptrimq_len == r.snaptrimq_len; + l.snaptrimq_len == r.snaptrimq_len && + l.scrub_duration == r.scrub_duration; } // -- store_statfs_t -- diff --git a/src/osd/osd_types.h b/src/osd/osd_types.h index 4674085838942..4f65e14d55d9f 100644 --- a/src/osd/osd_types.h +++ b/src/osd/osd_types.h @@ -2249,6 +2249,8 @@ struct pg_stat_t { bool pin_stats_invalid:1; bool manifest_stats_invalid:1; + double scrub_duration; + pg_stat_t() : reported_seq(0), reported_epoch(0), @@ -2266,7 +2268,8 @@ struct pg_stat_t { hitset_stats_invalid(false), hitset_bytes_stats_invalid(false), pin_stats_invalid(false), - manifest_stats_invalid(false) + manifest_stats_invalid(false), + scrub_duration(0) { } epoch_t get_effective_last_epoch_clean() const { diff --git a/src/osd/scrubber/pg_scrubber.cc b/src/osd/scrubber/pg_scrubber.cc index dce25081a3a78..df7fe95a41db8 100644 --- a/src/osd/scrubber/pg_scrubber.cc +++ b/src/osd/scrubber/pg_scrubber.cc @@ -2008,6 +2008,20 @@ PgScrubber::PgScrubber(PG* pg) m_osds->get_nodeid()); } +void PgScrubber::set_scrub_begin_time() { + scrub_begin_stamp = ceph_clock_now(); +} + +void PgScrubber::set_scrub_duration() { + utime_t stamp = ceph_clock_now(); + utime_t duration = stamp - scrub_begin_stamp; + m_pg->recovery_state.update_stats( + [=](auto &history, auto &stats) { + stats.scrub_duration = double(duration); + return true; + }); +} + void PgScrubber::reserve_replicas() { dout(10) << __func__ << dendl; diff --git a/src/osd/scrubber/pg_scrubber.h b/src/osd/scrubber/pg_scrubber.h index 9077bfcf3bcf6..a9845efb1dd5c 100644 --- a/src/osd/scrubber/pg_scrubber.h +++ b/src/osd/scrubber/pg_scrubber.h @@ -421,6 +421,12 @@ class PgScrubber : public ScrubPgIF, public ScrubMachineListener { std::string dump_awaited_maps() const final; + void set_scrub_begin_time() final; + + void set_scrub_duration() final; + + utime_t scrub_begin_stamp; + protected: bool state_test(uint64_t m) const { return m_pg->state_test(m); } void state_set(uint64_t m) { m_pg->state_set(m); } diff --git a/src/osd/scrubber/scrub_machine.cc b/src/osd/scrubber/scrub_machine.cc index 4f9ed5e7f8300..d59fbe7e7f1c7 100644 --- a/src/osd/scrubber/scrub_machine.cc +++ b/src/osd/scrubber/scrub_machine.cc @@ -84,6 +84,14 @@ NotActive::NotActive(my_context ctx) : my_base(ctx) dout(10) << "-- state -->> NotActive" << dendl; } +sc::result NotActive::react(const StartScrub&) +{ + dout(10) << "NotActive::react(const StartScrub&)" << dendl; + DECLARE_LOCALS; + scrbr->set_scrub_begin_time(); + return transit(); +} + // ----------------------- ReservingReplicas --------------------------------- ReservingReplicas::ReservingReplicas(my_context ctx) : my_base(ctx) @@ -440,6 +448,14 @@ sc::result WaitDigestUpdate::react(const DigestUpdate&) return discard_event(); } +sc::result WaitDigestUpdate::react(const ScrubFinished&) +{ + DECLARE_LOCALS; // 'scrbr' & 'pg_id' aliases + dout(10) << "WaitDigestUpdate::react(const ScrubFinished&)" << dendl; + scrbr->set_scrub_duration(); + return transit(); +} + ScrubMachine::ScrubMachine(PG* pg, ScrubMachineListener* pg_scrub) : m_pg{pg}, m_pg_id{pg->pg_id}, m_scrbr{pg_scrub} { diff --git a/src/osd/scrubber/scrub_machine.h b/src/osd/scrubber/scrub_machine.h index f75c5acdc2ecb..f3c8d2ced603a 100644 --- a/src/osd/scrubber/scrub_machine.h +++ b/src/osd/scrubber/scrub_machine.h @@ -153,12 +153,13 @@ class ScrubMachine : public sc::state_machine { struct NotActive : sc::state { explicit NotActive(my_context ctx); - using reactions = mpl::list, + using reactions = mpl::list, // a scrubbing that was initiated at recovery completion, // and requires no resource reservations: sc::transition, sc::transition, sc::transition>; + sc::result react(const StartScrub&); }; struct ReservingReplicas : sc::state { @@ -310,9 +311,10 @@ struct WaitDigestUpdate : sc::state { explicit WaitDigestUpdate(my_context ctx); using reactions = mpl::list, - sc::transition, - sc::transition>; + sc::custom_reaction, + sc::transition>; sc::result react(const DigestUpdate&); + sc::result react(const ScrubFinished&); }; // ----------------------------- the "replica active" states ----------------------- diff --git a/src/osd/scrubber/scrub_machine_lstnr.h b/src/osd/scrubber/scrub_machine_lstnr.h index 28745d469d998..72ff4b7fdd55f 100644 --- a/src/osd/scrubber/scrub_machine_lstnr.h +++ b/src/osd/scrubber/scrub_machine_lstnr.h @@ -148,6 +148,10 @@ struct ScrubMachineListener { virtual void unreserve_replicas() = 0; + virtual void set_scrub_begin_time() = 0; + + virtual void set_scrub_duration() = 0; + /** * No new scrub session will start while a scrub was initiate on a PG, * and that PG is trying to acquire replica resources. -- 2.39.5