]> git-server-git.apps.pok.os.sepia.ceph.com Git - ceph.git/commitdiff
osd/scrub: Add scrub duration to pg dump stats 42977/head
author Aishwarya Mathuria <amathuri@redhat.com>
Tue, 24 Aug 2021 14:55:02 +0000 (20:25 +0530)
committer Aishwarya Mathuria <amathuri@redhat.com>
Fri, 1 Oct 2021 07:57:27 +0000 (13:27 +0530)
Addition of a new column, SCRUB_DURATION, to the pg stats that stores the time taken for a PG scrub.

Fixes: https://tracker.ceph.com/issues/52605
Signed-off-by: Aishwarya Mathuria <amathuri@redhat.com>
qa/standalone/scrub/osd-scrub-test.sh
src/mon/PGMap.cc
src/osd/osd_types.cc
src/osd/osd_types.h
src/osd/scrubber/pg_scrubber.cc
src/osd/scrubber/pg_scrubber.h
src/osd/scrubber/scrub_machine.cc
src/osd/scrubber/scrub_machine.h
src/osd/scrubber/scrub_machine_lstnr.h

index 6cea0b8fcf8321b4582e790618d981f186243cff..9105474e30f21d1c86863a6e12ef6155dbc7a99e 100755 (executable)
@@ -454,6 +454,43 @@ function TEST_scrub_permit_time() {
     teardown $dir || return 1
 }
 
+function TEST_pg_dump_scrub_duration() {
+    local dir=$1
+    local poolname=test
+    local OSDS=3
+    local objects=15
+
+    TESTDATA="testdata.$$"
+
+    setup $dir || return 1
+    run_mon $dir a --osd_pool_default_size=$OSDS || return 1
+    run_mgr $dir x || return 1
+    for osd in $(seq 0 $(expr $OSDS - 1))
+    do
+      run_osd $dir $osd || return 1
+    done
+
+    # Create a pool with a single pg
+    create_pool $poolname 1 1
+    wait_for_clean || return 1
+    poolid=$(ceph osd dump | grep "^pool.*[']${poolname}[']" | awk '{ print $2 }')
+
+    dd if=/dev/urandom of=$TESTDATA bs=1032 count=1
+    for i in `seq 1 $objects`
+    do
+        rados -p $poolname put obj${i} $TESTDATA
+    done
+    rm -f $TESTDATA
+
+    local pgid="${poolid}.0"
+    pg_scrub $pgid || return 1
+
+    ceph pg $pgid query | jq '.info.stats.scrub_duration'
+    test "$(ceph pg $pgid query | jq '.info.stats.scrub_duration')" '>' "0" || return 1
+
+    teardown $dir || return 1
+}
+
 main osd-scrub-test "$@"
 
 # Local Variables:
index cde8f0dd0cd60deaf30fa7648c57930b03d5609a..cb90535addd22de9fc1b33cdea769cf888ebbc41 100644 (file)
@@ -1657,6 +1657,7 @@ void PGMap::dump_pg_stats_plain(
     tab.define_column("LAST_DEEP_SCRUB", TextTable::LEFT, TextTable::RIGHT);
     tab.define_column("DEEP_SCRUB_STAMP", TextTable::LEFT, TextTable::RIGHT);
     tab.define_column("SNAPTRIMQ_LEN", TextTable::LEFT, TextTable::RIGHT);
+    tab.define_column("SCRUB_DURATION", TextTable::LEFT, TextTable::RIGHT);
   }
 
   for (auto i = pg_stats.begin();
@@ -1698,6 +1699,7 @@ void PGMap::dump_pg_stats_plain(
           << st.last_deep_scrub
           << st.last_deep_scrub_stamp
           << st.snaptrimq_len
+          << st.scrub_duration
           << TextTable::endrow;
     }
   }
@@ -2228,6 +2230,7 @@ void PGMap::dump_filtered_pg_stats(ostream& ss, set<pg_t>& pgs) const
   tab.define_column("ACTING", TextTable::LEFT, TextTable::RIGHT);
   tab.define_column("SCRUB_STAMP", TextTable::LEFT, TextTable::RIGHT);
   tab.define_column("DEEP_SCRUB_STAMP", TextTable::LEFT, TextTable::RIGHT);
+  tab.define_column("SCRUB_DURATION", TextTable::LEFT, TextTable::RIGHT);
 
   for (auto i = pgs.begin(); i != pgs.end(); ++i) {
     const pg_stat_t& st = pg_stat.at(*i);
@@ -2255,6 +2258,7 @@ void PGMap::dump_filtered_pg_stats(ostream& ss, set<pg_t>& pgs) const
         << actingstr.str()
         << st.last_scrub_stamp
         << st.last_deep_scrub_stamp
+        << st.scrub_duration
         << TextTable::endrow;
   }
 
index 4e1a9b6b8ae3ea9dc23c347afca3cccee71a8aa0..3bb4dfbab4e4f5b01ed1299be5d026aff0e1a4da 100644 (file)
@@ -2856,6 +2856,7 @@ void pg_stat_t::dump(Formatter *f) const
   f->dump_bool("pin_stats_invalid", pin_stats_invalid);
   f->dump_bool("manifest_stats_invalid", manifest_stats_invalid);
   f->dump_unsigned("snaptrimq_len", snaptrimq_len);
+  f->dump_float("scrub_duration", scrub_duration);
   stats.dump(f);
   f->open_array_section("up");
   for (auto p = up.cbegin(); p != up.cend(); ++p)
@@ -2910,7 +2911,7 @@ void pg_stat_t::dump_brief(Formatter *f) const
 
 void pg_stat_t::encode(ceph::buffer::list &bl) const
 {
-  ENCODE_START(26, 22, bl);
+  ENCODE_START(27, 22, bl);
   encode(version, bl);
   encode(reported_seq, bl);
   encode(reported_epoch, bl);
@@ -2958,6 +2959,7 @@ void pg_stat_t::encode(ceph::buffer::list &bl) const
   encode(manifest_stats_invalid, bl);
   encode(avail_no_missing, bl);
   encode(object_location_counts, bl);
+  encode(scrub_duration, bl);
   ENCODE_FINISH(bl);
 }
 
@@ -3032,6 +3034,9 @@ void pg_stat_t::decode(ceph::buffer::list::const_iterator &bl)
       decode(avail_no_missing, bl);
       decode(object_location_counts, bl);
     }
+    if (struct_v >= 27) {
+      decode(scrub_duration, bl);
+    }
   }
   DECODE_FINISH(bl);
 }
@@ -3064,6 +3069,7 @@ void pg_stat_t::generate_test_instances(list<pg_stat_t*>& o)
   a.last_deep_scrub = eversion_t(13, 14);
   a.last_deep_scrub_stamp = utime_t(15, 16);
   a.last_clean_scrub_stamp = utime_t(17, 18);
+  a.scrub_duration = 0.003;
   a.snaptrimq_len = 1048576;
   list<object_stat_collection_t*> l;
   object_stat_collection_t::generate_test_instances(l);
@@ -3137,7 +3143,8 @@ bool operator==(const pg_stat_t& l, const pg_stat_t& r)
     l.pin_stats_invalid == r.pin_stats_invalid &&
     l.manifest_stats_invalid == r.manifest_stats_invalid &&
     l.purged_snaps == r.purged_snaps &&
-    l.snaptrimq_len == r.snaptrimq_len;
+    l.snaptrimq_len == r.snaptrimq_len &&
+    l.scrub_duration == r.scrub_duration;
 }
 
 // -- store_statfs_t --
index 4674085838942d3f250bd3f2462079250d751089..4f65e14d55d9fbe39f63320c03c9f37d1c532a52 100644 (file)
@@ -2249,6 +2249,8 @@ struct pg_stat_t {
   bool pin_stats_invalid:1;
   bool manifest_stats_invalid:1;
 
+  double scrub_duration;
+
   pg_stat_t()
     : reported_seq(0),
       reported_epoch(0),
@@ -2266,7 +2268,8 @@ struct pg_stat_t {
       hitset_stats_invalid(false),
       hitset_bytes_stats_invalid(false),
       pin_stats_invalid(false),
-      manifest_stats_invalid(false)
+      manifest_stats_invalid(false),
+      scrub_duration(0)
   { }
 
   epoch_t get_effective_last_epoch_clean() const {
index dce25081a3a780b7fa4de1c3a8259f9f2d82bd58..df7fe95a41db819b6ee50887c471b5a190e2c7db 100644 (file)
@@ -2008,6 +2008,20 @@ PgScrubber::PgScrubber(PG* pg)
                                                     m_osds->get_nodeid());
 }
 
+void PgScrubber::set_scrub_begin_time() {  // invoked by the scrub FSM on StartScrub
+  scrub_begin_stamp = ceph_clock_now();  // reference point for set_scrub_duration()
+}
+
+/**
+ * Publish the time elapsed since set_scrub_begin_time() as the PG's
+ * 'scrub_duration' stat (the utime_t difference converted to double;
+ * presumably seconds - TODO confirm against utime_t).
+ *
+ * A begin time is recorded only when the state machine handles StartScrub.
+ * A scrub entered through any other event has no stamp of its own, so:
+ *  - nothing is published when no begin time is recorded, and
+ *  - the stamp is cleared once used, to prevent a later scrub from being
+ *    measured against this one's starting point.
+ */
+void PgScrubber::set_scrub_duration()
+{
+  if (scrub_begin_stamp.is_zero()) {
+    // 'now - 0' would report the time since the epoch as the duration
+    dout(10) << __func__ << ": no scrub begin time recorded" << dendl;
+    return;
+  }
+
+  const utime_t duration = ceph_clock_now() - scrub_begin_stamp;
+  scrub_begin_stamp = utime_t();  // consumed
+
+  // only 'duration' is needed inside the callback; capture it explicitly
+  m_pg->recovery_state.update_stats(
+    [duration](auto& /*history*/, auto& stats) {
+      stats.scrub_duration = double(duration);
+      return true;  // same as before: report the stats as modified
+    });
+}
+
 void PgScrubber::reserve_replicas()
 {
   dout(10) << __func__ << dendl;
index 9077bfcf3bcf694a88752380c7fe49761835188c..a9845efb1dd5c24027466bba6352f5cc23ddb46f 100644 (file)
@@ -421,6 +421,12 @@ class PgScrubber : public ScrubPgIF, public ScrubMachineListener {
 
   std::string dump_awaited_maps() const final;
 
+  void set_scrub_begin_time() final;
+
+  void set_scrub_duration() final;
+
+  utime_t scrub_begin_stamp;
+
  protected:
   bool state_test(uint64_t m) const { return m_pg->state_test(m); }
   void state_set(uint64_t m) { m_pg->state_set(m); }
index 4f9ed5e7f8300858f18330345bad3db70250da9e..d59fbe7e7f1c70fb4e45a60722d44477874e43ac 100644 (file)
@@ -84,6 +84,14 @@ NotActive::NotActive(my_context ctx) : my_base(ctx)
   dout(10) << "-- state -->> NotActive" << dendl;
 }
 
+sc::result NotActive::react(const StartScrub&)  // stamps the scrub start, then moves on
+{
+  dout(10) << "NotActive::react(const StartScrub&)" << dendl;
+  DECLARE_LOCALS;  // provides 'scrbr' (used below)
+  scrbr->set_scrub_begin_time();  // starting point for the reported scrub duration
+  return transit<ReservingReplicas>();  // NOTE(review): AfterRepairScrub skips the stamping
+}
+
 // ----------------------- ReservingReplicas ---------------------------------
 
 ReservingReplicas::ReservingReplicas(my_context ctx) : my_base(ctx)
@@ -440,6 +448,14 @@ sc::result WaitDigestUpdate::react(const DigestUpdate&)
   return discard_event();
 }
 
+sc::result WaitDigestUpdate::react(const ScrubFinished&)  // records the duration, then exits
+{
+  DECLARE_LOCALS;  // 'scrbr' & 'pg_id' aliases
+  dout(10) << "WaitDigestUpdate::react(const ScrubFinished&)" << dendl;
+  scrbr->set_scrub_duration();  // elapsed time since set_scrub_begin_time()
+  return transit<NotActive>();  // same target as the sc::transition this reaction replaced
+}
+
 ScrubMachine::ScrubMachine(PG* pg, ScrubMachineListener* pg_scrub)
     : m_pg{pg}, m_pg_id{pg->pg_id}, m_scrbr{pg_scrub}
 {
index f75c5acdc2ecbf0b6fe2e19fe4dfcedfe4aa178b..f3c8d2ced603afd9a921827c695fb508ae50c998 100644 (file)
@@ -153,12 +153,13 @@ class ScrubMachine : public sc::state_machine<ScrubMachine, NotActive> {
 struct NotActive : sc::state<NotActive, ScrubMachine> {
   explicit NotActive(my_context ctx);
 
-  using reactions = mpl::list<sc::transition<StartScrub, ReservingReplicas>,
+  using reactions = mpl::list<sc::custom_reaction<StartScrub>,
                              // a scrubbing that was initiated at recovery completion,
                              // and requires no resource reservations:
                              sc::transition<AfterRepairScrub, ReservingReplicas>,
                              sc::transition<StartReplica, ReplicaWaitUpdates>,
                              sc::transition<StartReplicaNoWait, ActiveReplica>>;
+  sc::result react(const StartScrub&);
 };
 
 struct ReservingReplicas : sc::state<ReservingReplicas, ScrubMachine> {
@@ -310,9 +311,10 @@ struct WaitDigestUpdate : sc::state<WaitDigestUpdate, ActiveScrubbing> {
   explicit WaitDigestUpdate(my_context ctx);
 
   using reactions = mpl::list<sc::custom_reaction<DigestUpdate>,
-                             sc::transition<NextChunk, PendingTimer>,
-                             sc::transition<ScrubFinished, NotActive>>;
+                            sc::custom_reaction<ScrubFinished>,
+                            sc::transition<NextChunk, PendingTimer>>;
   sc::result react(const DigestUpdate&);
+  sc::result react(const ScrubFinished&);
 };
 
 // ----------------------------- the "replica active" states -----------------------
index 28745d469d998638525e9306ad8767d210a90995..72ff4b7fdd55f4086ce0b9a7cbb95ac2b1a0282c 100644 (file)
@@ -148,6 +148,10 @@ struct ScrubMachineListener {
 
   virtual void unreserve_replicas() = 0;
 
+  virtual void set_scrub_begin_time() = 0;
+
+  virtual void set_scrub_duration() = 0;
+
   /**
    * No new scrub session will start while a scrub was initiate on a PG,
    * and that PG is trying to acquire replica resources.