From 57abdb11fae9f94417e1ee6c9427a33f729a9703 Mon Sep 17 00:00:00 2001
From: David Zafman
Date: Tue, 19 Mar 2019 13:55:36 -0700
Subject: [PATCH] osd, test: Add num_shards_repaired to osd_stat_t for pushes
 with repair set

Fixes: http://tracker.ceph.com/issues/38616

Signed-off-by: David Zafman
---
 qa/standalone/scrub/osd-scrub-repair.sh | 101 +++++++++++++++++++++++-
 src/messages/MOSDPGPush.h               |   9 ++-
 src/osd/ECBackend.cc                    |  11 ++-
 src/osd/ECBackend.h                     |   3 +-
 src/osd/OSD.cc                          |   7 ++
 src/osd/OSD.h                           |   1 +
 src/osd/PGBackend.h                     |   2 +
 src/osd/PrimaryLogPG.h                  |   6 ++
 src/osd/ReplicatedBackend.cc            |  16 +++-
 src/osd/ReplicatedBackend.h             |   2 +-
 src/osd/osd_types.cc                    |  12 ++-
 src/osd/osd_types.h                     |   7 +-
 12 files changed, 163 insertions(+), 14 deletions(-)

diff --git a/qa/standalone/scrub/osd-scrub-repair.sh b/qa/standalone/scrub/osd-scrub-repair.sh
index 2af971f157651..b62e2c086c813 100755
--- a/qa/standalone/scrub/osd-scrub-repair.sh
+++ b/qa/standalone/scrub/osd-scrub-repair.sh
@@ -498,7 +498,9 @@ function TEST_auto_repair_bluestore_failed_norecov() {
 function TEST_repair_stats() {
     local dir=$1
     local poolname=testpool
+    local OSDS=2
     local OBJS=30
+    # This needs to be an even number
     local REPAIRS=20
 
     # Launch a cluster with 5 seconds scrub interval
@@ -507,7 +509,7 @@ function TEST_repair_stats() {
     run_mgr $dir x || return 1
     local ceph_osd_args="--osd_deep_scrub_randomize_ratio=0 \
                         --osd-scrub-interval-randomize-ratio=0"
-    for id in $(seq 0 2) ; do
+    for id in $(seq 0 $(expr $OSDS - 1)) ; do
         run_osd_bluestore $dir $id $ceph_osd_args || return 1
     done
 
@@ -530,20 +532,115 @@ function TEST_repair_stats() {
     local primary=$(get_primary $poolname obj1)
 
     kill_daemons $dir TERM osd.$other >&2 < /dev/null || return 1
+    kill_daemons $dir TERM osd.$primary >&2 < /dev/null || return 1
+    for i in $(seq 1 $REPAIRS)
+    do
+        # Remove from both osd.0 and osd.1
+        OSD=$(expr $i % 2)
+        _objectstore_tool_nodown $dir $OSD obj$i remove || return 1
+    done
+    run_osd_bluestore $dir $primary $ceph_osd_args || return 1
+    run_osd_bluestore $dir $other $ceph_osd_args || return 1
+    wait_for_clean || return 1
+
+    repair $pgid
+    wait_for_clean || return 1
+    ceph pg dump pgs
+
+    # This should have caused $REPAIRS objects to be repaired
+    ceph pg $pgid query | jq '.info.stats.stat_sum'
+    COUNT=$(ceph pg $pgid query | jq '.info.stats.stat_sum.num_objects_repaired')
+    test "$COUNT" = "$REPAIRS" || return 1
+
+    ceph pg dump --format=json-pretty | jq ".pg_map.osd_stats[] | select(.osd == $primary )"
+    COUNT=$(ceph pg dump --format=json-pretty | jq ".pg_map.osd_stats[] | select(.osd == $primary ).num_shards_repaired")
+    test "$COUNT" = "$(expr $REPAIRS / 2)" || return 1
+
+    ceph pg dump --format=json-pretty | jq ".pg_map.osd_stats[] | select(.osd == $other )"
+    COUNT=$(ceph pg dump --format=json-pretty | jq ".pg_map.osd_stats[] | select(.osd == $other ).num_shards_repaired")
+    test "$COUNT" = "$(expr $REPAIRS / 2)" || return 1
+
+    ceph pg dump --format=json-pretty | jq ".pg_map.osd_stats_sum"
+    COUNT=$(ceph pg dump --format=json-pretty | jq ".pg_map.osd_stats_sum.num_shards_repaired")
+    test "$COUNT" = "$REPAIRS" || return 1
+
+    # Tear down
+    teardown $dir || return 1
+}
+
+function TEST_repair_stats_ec() {
+    local dir=$1
+    local poolname=testpool
+    local OSDS=3
+    local OBJS=30
+    # This needs to be an even number
+    local REPAIRS=26
+    local allow_overwrites=false
+
+    # Launch a cluster with 5 seconds scrub interval
+    setup $dir || return 1
+    run_mon $dir a || return 1
+    run_mgr $dir x || return 1
+    local ceph_osd_args="--osd_deep_scrub_randomize_ratio=0 \
+                        --osd-scrub-interval-randomize-ratio=0"
+    for id in $(seq 0 $(expr $OSDS - 1)) ; do
+        run_osd_bluestore $dir $id $ceph_osd_args || return 1
+    done
+
+    # Create an EC pool
+    create_ec_pool $poolname $allow_overwrites k=2 m=1 || return 1
+
+    # Put an object
+    local payload=ABCDEF
+    echo $payload > $dir/ORIGINAL
+    for i in $(seq 1 $OBJS)
+    do
+        rados --pool $poolname put obj$i $dir/ORIGINAL || return 1
+    done
+
+    # Remove the object from one shard physically
+    # Restarted osd get $ceph_osd_args passed
+    local other=$(get_not_primary $poolname obj1)
+    local pgid=$(get_pg $poolname obj1)
+    local primary=$(get_primary $poolname obj1)
+
+    kill_daemons $dir TERM osd.$other >&2 < /dev/null || return 1
+    kill_daemons $dir TERM osd.$primary >&2 < /dev/null || return 1
     for i in $(seq 1 $REPAIRS)
     do
-        _objectstore_tool_nodown $dir $other obj$i remove || return 1
+        # Remove from both osd.0 and osd.1
+        OSD=$(expr $i % 2)
+        _objectstore_tool_nodown $dir $OSD obj$i remove || return 1
     done
+    run_osd_bluestore $dir $primary $ceph_osd_args || return 1
     run_osd_bluestore $dir $other $ceph_osd_args || return 1
+    wait_for_clean || return 1
 
     repair $pgid
     wait_for_clean || return 1
     ceph pg dump pgs
 
     # This should have caused 1 object to be repaired
+    ceph pg $pgid query | jq '.info.stats.stat_sum'
     COUNT=$(ceph pg $pgid query | jq '.info.stats.stat_sum.num_objects_repaired')
     test "$COUNT" = "$REPAIRS" || return 1
 
+    for osd in $(seq 0 $(expr $OSDS - 1)) ; do
+      if [ $osd = $other -o $osd = $primary ]; then
+          repair=$(expr $REPAIRS / 2)
+      else
+          repair="0"
+      fi
+
+      ceph pg dump --format=json-pretty | jq ".pg_map.osd_stats[] | select(.osd == $osd )"
+      COUNT=$(ceph pg dump --format=json-pretty | jq ".pg_map.osd_stats[] | select(.osd == $osd ).num_shards_repaired")
+      test "$COUNT" = "$repair" || return 1
+    done
+
+    ceph pg dump --format=json-pretty | jq ".pg_map.osd_stats_sum"
+    COUNT=$(ceph pg dump --format=json-pretty | jq ".pg_map.osd_stats_sum.num_shards_repaired")
+    test "$COUNT" = "$REPAIRS" || return 1
+
     # Tear down
     teardown $dir || return 1
 }
diff --git a/src/messages/MOSDPGPush.h b/src/messages/MOSDPGPush.h
index d7da913306728..3960ad70ea387 100644
--- a/src/messages/MOSDPGPush.h
+++ b/src/messages/MOSDPGPush.h
@@ -21,7 +21,7 @@ class MOSDPGPush : public MessageInstance<MOSDPGPush, MOSDFastDispatchOp> {
 public:
   friend factory;
 private:
-  static constexpr int HEAD_VERSION = 3;
+  static constexpr int HEAD_VERSION = 4;
   static constexpr int COMPAT_VERSION = 2;
 
 public:
@@ -29,6 +29,7 @@ public:
   spg_t pgid;
   epoch_t map_epoch = 0, min_epoch = 0;
   vector<PushOp> pushes;
+  bool is_repair = false;
 
 private:
   uint64_t cost;
@@ -79,6 +80,11 @@
     } else {
       min_epoch = map_epoch;
     }
+    if (header.version >= 4) {
+      decode(is_repair, p);
+    } else {
+      is_repair = false;
+    }
   }
 
   void encode_payload(uint64_t features) override {
@@ -90,6 +96,7 @@ public:
     encode(pgid.shard, payload);
     encode(from, payload);
     encode(min_epoch, payload);
+    encode(is_repair, payload);
   }
 
   std::string_view get_type_name() const override { return "MOSDPGPush"; }
diff --git a/src/osd/ECBackend.cc b/src/osd/ECBackend.cc
index dfa1f4276b27e..36a77cc7417a9 100644
--- a/src/osd/ECBackend.cc
+++ b/src/osd/ECBackend.cc
@@ -285,7 +285,8 @@ struct RecoveryMessages {
 
 void ECBackend::handle_recovery_push(
   const PushOp &op,
-  RecoveryMessages *m)
+  RecoveryMessages *m,
+  bool is_repair)
 {
   if (get_parent()->check_failsafe_full()) {
     dout(10) << __func__ << " Out of space (failsafe) processing push request." << dendl;
@@ -361,6 +362,8 @@ void ECBackend::handle_recovery_push(
   if ((get_parent()->pgb_is_primary())) {
     ceph_assert(recovery_ops.count(op.soid));
     ceph_assert(recovery_ops[op.soid].obc);
+    if (get_parent()->pg_is_repair())
+      get_parent()->inc_osd_stat_repaired();
     get_parent()->on_local_recover(
       op.soid,
       op.recovery_info,
@@ -368,6 +371,9 @@ void ECBackend::handle_recovery_push(
       false,
       &m->t);
   } else {
+    // If primary told us this is a repair, bump osd_stat_t::num_shards_repaired
+    if (is_repair)
+      get_parent()->inc_osd_stat_repaired();
     get_parent()->on_local_recover(
       op.soid,
       op.recovery_info,
@@ -517,6 +523,7 @@ void ECBackend::dispatch_recovery_messages(RecoveryMessages &m, int priority)
     msg->pgid = spg_t(get_parent()->get_info().pgid.pgid, i->first.shard);
     msg->pushes.swap(i->second);
     msg->compute_cost(cct);
+    msg->is_repair = get_parent()->pg_is_repair();
     get_parent()->send_message(
       i->first.osd,
       msg);
@@ -825,7 +832,7 @@ bool ECBackend::_handle_message(
     for (vector<PushOp>::const_iterator i = op->pushes.begin();
          i != op->pushes.end();
          ++i) {
-      handle_recovery_push(*i, &rm);
+      handle_recovery_push(*i, &rm, op->is_repair);
     }
     dispatch_recovery_messages(rm, priority);
     return true;
diff --git a/src/osd/ECBackend.h b/src/osd/ECBackend.h
index 89d5dcbcb5240..e003a08c73667 100644
--- a/src/osd/ECBackend.h
+++ b/src/osd/ECBackend.h
@@ -306,7 +306,8 @@ private:
     RecoveryMessages *m);
   void handle_recovery_push(
     const PushOp &op,
-    RecoveryMessages *m);
+    RecoveryMessages *m,
+    bool is_repair);
   void handle_recovery_push_reply(
     const PushReplyOp &op,
     pg_shard_t from,
diff --git a/src/osd/OSD.cc b/src/osd/OSD.cc
index 020fb437294e1..d32c73b25d568 100644
--- a/src/osd/OSD.cc
+++ b/src/osd/OSD.cc
@@ -945,6 +945,13 @@ osd_stat_t OSDService::set_osd_stat(vector<int>& hb_peers,
   return osd_stat;
 }
 
+void OSDService::inc_osd_stat_repaired()
+{
+  std::lock_guard l(stat_lock);
+  osd_stat.num_shards_repaired++;
+  return;
+}
+
 float OSDService::compute_adjusted_ratio(osd_stat_t new_stat, float *pratio,
                                          uint64_t adjust_used)
 {
diff --git a/src/osd/OSD.h b/src/osd/OSD.h
index 12ceefc9f5f0d..ba01a8eb46379 100644
--- a/src/osd/OSD.h
+++ b/src/osd/OSD.h
@@ -907,6 +907,7 @@ public:
   void set_statfs(const struct store_statfs_t &stbuf,
     osd_alert_list_t& alerts);
   osd_stat_t set_osd_stat(vector<int>& hb_peers, int num_pgs);
+  void inc_osd_stat_repaired(void);
   float compute_adjusted_ratio(osd_stat_t new_stat, float *pratio, uint64_t adjust_used = 0);
   osd_stat_t get_osd_stat() {
     std::lock_guard l(stat_lock);
diff --git a/src/osd/PGBackend.h b/src/osd/PGBackend.h
index 75f895b1b39cb..fa1354c70b269 100644
--- a/src/osd/PGBackend.h
+++ b/src/osd/PGBackend.h
@@ -294,6 +294,8 @@ typedef std::shared_ptr<const OSDMap> OSDMapRef;
 
      virtual bool check_osdmap_full(const set<pg_shard_t> &missing_on) = 0;
 
+     virtual bool pg_is_repair() = 0;
+     virtual void inc_osd_stat_repaired() = 0;
      virtual bool pg_is_remote_backfilling() = 0;
      virtual void pg_add_local_num_bytes(int64_t num_bytes) = 0;
      virtual void pg_sub_local_num_bytes(int64_t num_bytes) = 0;
diff --git a/src/osd/PrimaryLogPG.h b/src/osd/PrimaryLogPG.h
index e5f47b6a6c851..c0f4afb1846ad 100644
--- a/src/osd/PrimaryLogPG.h
+++ b/src/osd/PrimaryLogPG.h
@@ -402,6 +402,12 @@ public:
     release_object_locks(manager);
   }
 
+  bool pg_is_repair() override {
+    return is_repair();
+  }
+  void inc_osd_stat_repaired() override {
+    osd->inc_osd_stat_repaired();
+  }
   bool pg_is_remote_backfilling() override {
     return is_remote_backfilling();
   }
diff --git a/src/osd/ReplicatedBackend.cc b/src/osd/ReplicatedBackend.cc
index 2602bbd24e31a..9614a58997343 100644
--- a/src/osd/ReplicatedBackend.cc
+++ b/src/osd/ReplicatedBackend.cc
@@ -750,7 +750,7 @@ void ReplicatedBackend::_do_push(OpRequestRef op)
        i != m->pushes.end();
        ++i) {
     replies.push_back(PushReplyOp());
-    handle_push(from, *i, &(replies.back()), &t);
+    handle_push(from, *i, &(replies.back()), &t, m->is_repair);
   }
 
   MOSDPGPushReply *reply = new MOSDPGPushReply;
@@ -1726,8 +1726,10 @@ bool ReplicatedBackend::handle_pull_response(
   if (complete) {
     pi.stat.num_objects_recovered++;
     // XXX: This could overcount if regular recovery is needed right after a repair
-    if (get_parent()->pg_is_repair())
+    if (get_parent()->pg_is_repair()) {
       pi.stat.num_objects_repaired++;
+      get_parent()->inc_osd_stat_repaired();
+    }
     clear_pull_from(piter);
     to_continue->push_back({hoid, pi.stat});
     get_parent()->on_local_recover(
@@ -1743,7 +1745,7 @@
 
 void ReplicatedBackend::handle_push(
   pg_shard_t from, const PushOp &pop, PushReplyOp *response,
-  ObjectStore::Transaction *t)
+  ObjectStore::Transaction *t, bool is_repair)
 {
   dout(10) << "handle_push "
            << pop.recovery_info
@@ -1767,13 +1769,18 @@ void ReplicatedBackend::handle_push(
                    pop.omap_entries,
                    t);
 
-  if (complete)
+  if (complete) {
+    if (is_repair) {
+      get_parent()->inc_osd_stat_repaired();
+      dout(20) << __func__ << " repair complete" << dendl;
+    }
     get_parent()->on_local_recover(
       pop.recovery_info.soid,
       pop.recovery_info,
       ObjectContextRef(), // ok, is replica
       false,
       t);
+  }
 }
 
 void ReplicatedBackend::send_pushes(int prio, map<pg_shard_t, vector<PushOp> > &pushes)
@@ -1796,6 +1803,7 @@ void ReplicatedBackend::send_pushes(int prio, map<pg_shard_t, vector<PushOp> > &pushes)
     msg->map_epoch = get_osdmap_epoch();
     msg->min_epoch = get_parent()->get_last_peering_reset_epoch();
     msg->set_priority(prio);
+    msg->is_repair = get_parent()->pg_is_repair();
     for (;
          (j != i->second.end() &&
           cost < cct->_conf->osd_max_push_cost &&
diff --git a/src/osd/ReplicatedBackend.h b/src/osd/ReplicatedBackend.h
index c5593080deed0..8f447495a4ed1 100644
--- a/src/osd/ReplicatedBackend.h
+++ b/src/osd/ReplicatedBackend.h
@@ -244,7 +244,7 @@ private:
     list<pull_complete_info> *to_continue,
     ObjectStore::Transaction *t);
   void handle_push(pg_shard_t from, const PushOp &op, PushReplyOp *response,
-                   ObjectStore::Transaction *t);
+                   ObjectStore::Transaction *t, bool is_repair);
 
   static void trim_pushed_data(const interval_set<uint64_t> &copy_subset,
                                const interval_set<uint64_t> &intervals_received,
diff --git a/src/osd/osd_types.cc b/src/osd/osd_types.cc
index 3830167e248b1..af3f0d70a01c3 100644
--- a/src/osd/osd_types.cc
+++ b/src/osd/osd_types.cc
@@ -381,6 +381,7 @@ void osd_stat_t::dump(Formatter *f) const
   f->close_section();
   f->dump_int("snap_trim_queue_len", snap_trim_queue_len);
   f->dump_int("num_snap_trimming", num_snap_trimming);
+  f->dump_int("num_shards_repaired", num_shards_repaired);
   f->open_object_section("op_queue_age_hist");
   op_queue_age_hist.dump(f);
   f->close_section();
@@ -394,7 +395,7 @@
 
 void osd_stat_t::encode(bufferlist &bl, uint64_t features) const
 {
-  ENCODE_START(10, 2, bl);
+  ENCODE_START(11, 2, bl);
 
   //////// for compatibility ////////
   int64_t kb = statfs.kb();
@@ -425,6 +426,7 @@ void osd_stat_t::encode(bufferlist &bl, uint64_t features) const
   encode(statfs, bl);
   ///////////////////////////////////
   encode(os_alerts, bl);
+  encode(num_shards_repaired, bl);
   ENCODE_FINISH(bl);
 }
 
@@ -432,7 +434,7 @@ void osd_stat_t::decode(bufferlist::const_iterator &bl)
 {
   int64_t kb, kb_used,kb_avail;
   int64_t kb_used_data, kb_used_omap, kb_used_meta;
-  DECODE_START_LEGACY_COMPAT_LEN(10, 2, 2, bl);
+  DECODE_START_LEGACY_COMPAT_LEN(11, 2, 2, bl);
   decode(kb, bl);
   decode(kb_used, bl);
   decode(kb_avail, bl);
@@ -487,6 +489,11 @@ void osd_stat_t::decode(bufferlist::const_iterator &bl)
   } else {
     os_alerts.clear();
   }
+  if (struct_v >= 11) {
+    decode(num_shards_repaired, bl);
+  } else {
+    num_shards_repaired = 0;
+  }
   DECODE_FINISH(bl);
 }
 
@@ -501,6 +508,7 @@ void osd_stat_t::generate_test_instances(std::list<osd_stat_t*>& o)
   o.back()->hb_peers.push_back(7);
   o.back()->snap_trim_queue_len = 8;
   o.back()->num_snap_trimming = 99;
+  o.back()->num_shards_repaired = 101;
   o.back()->os_alerts[0].emplace(
     "some alert", "some alert details");
   o.back()->os_alerts[1].emplace(
diff --git a/src/osd/osd_types.h b/src/osd/osd_types.h
index fbcc71ba3474f..440d43357d388 100644
--- a/src/osd/osd_types.h
+++ b/src/osd/osd_types.h
@@ -2321,6 +2321,7 @@ struct osd_stat_t {
   store_statfs_t statfs;
   vector<int> hb_peers;
   int32_t snap_trim_queue_len, num_snap_trimming;
+  uint64_t num_shards_repaired;
 
   pow2_hist_t op_queue_age_hist;
 
@@ -2332,12 +2333,14 @@ struct osd_stat_t {
 
   uint32_t num_pgs = 0;
 
-  osd_stat_t() : snap_trim_queue_len(0), num_snap_trimming(0) {}
+  osd_stat_t() : snap_trim_queue_len(0), num_snap_trimming(0),
+       num_shards_repaired(0) {}
 
   void add(const osd_stat_t& o) {
     statfs.add(o.statfs);
     snap_trim_queue_len += o.snap_trim_queue_len;
     num_snap_trimming += o.num_snap_trimming;
+    num_shards_repaired += o.num_shards_repaired;
     op_queue_age_hist.add(o.op_queue_age_hist);
     os_perf_stat.add(o.os_perf_stat);
     num_pgs += o.num_pgs;
@@ -2352,6 +2355,7 @@ struct osd_stat_t {
     statfs.sub(o.statfs);
     snap_trim_queue_len -= o.snap_trim_queue_len;
     num_snap_trimming -= o.num_snap_trimming;
+    num_shards_repaired -= o.num_shards_repaired;
     op_queue_age_hist.sub(o.op_queue_age_hist);
     os_perf_stat.sub(o.os_perf_stat);
     num_pgs -= o.num_pgs;
@@ -2376,6 +2380,7 @@ inline bool operator==(const osd_stat_t& l, const osd_stat_t& r) {
   return l.statfs == r.statfs &&
     l.snap_trim_queue_len == r.snap_trim_queue_len &&
     l.num_snap_trimming == r.num_snap_trimming &&
+    l.num_shards_repaired == r.num_shards_repaired &&
    l.hb_peers == r.hb_peers &&
    l.op_queue_age_hist == r.op_queue_age_hist &&
    l.os_perf_stat == r.os_perf_stat &&
-- 
2.39.5