osd: add clear_shards_repaired command

author DanWritesCode <github@dann.me>

Mon, 18 Dec 2023 21:09:07 +0000 (16:09 -0500)

committer DanWritesCode <github@dann.me>

Mon, 4 Mar 2024 21:08:48 +0000 (16:08 -0500)
author DanWritesCode <github@dann.me>
Mon, 18 Dec 2023 21:09:07 +0000 (16:09 -0500)
committer DanWritesCode <github@dann.me>
Mon, 4 Mar 2024 21:08:48 +0000 (16:08 -0500)
diff --git a/doc/rados/operations/health-checks.rst b/doc/rados/operations/health-checks.rst

index 54bfd4279677e62faf0a303f17a8e82d6d138e44..4059911654a4205941dce743fc5a043d88a7392d 100644 (file)
--- a/doc/rados/operations/health-checks.rst
+++ b/doc/rados/operations/health-checks.rst
@@ -983,6 +983,13 @@ the object data, there might exist failing disks that are not registering any
  scrub errors. This repair count is maintained as a way of identifying any such
  failing disks.
  
+In order to allow clearing of the warning, a new command
+``ceph tell osd.# clear_shards_repaired [count]`` has been added.
+By default it sets the repair count to 0. A ``count`` value can be passed
+to the command. Thus, the administrator has the option to re-enable the warning
+by passing the value of ``mon_osd_warn_num_repaired`` (or above) to the command.
+An alternative to using ``clear_shards_repaired`` is to mute the
+``OSD_TOO_MANY_REPAIRS`` alert with ``ceph health mute``.
  
  LARGE_OMAP_OBJECTS
  __________________
diff --git a/qa/standalone/osd/osd-rep-recov-eio.sh b/qa/standalone/osd/osd-rep-recov-eio.sh

index 6fea441b3a95f7bf99a1ed28905add4c85521200..a34f4a47189af76cfc8ba41768a58979ed296271 100755 (executable)
--- a/qa/standalone/osd/osd-rep-recov-eio.sh
+++ b/qa/standalone/osd/osd-rep-recov-eio.sh
@@ -219,6 +219,18 @@ function TEST_rados_repair_warning() {
      ceph health | $(! grep -q "Too many repaired reads on 1 OSDs") || return 1
      set +o pipefail
  
+    ceph health unmute OSD_TOO_MANY_REPAIRS
+    ceph tell osd.$primary clear_shards_repaired
+    sleep 10
+
+    set -o pipefail
+    # The warning should be gone now that the repair count was reset to 0
+    ceph health | $(! grep -q "Too many repaired reads on 1 OSDs") || return 1
+    set +o pipefail
+
+    ceph tell osd.$primary clear_shards_repaired $OBJS
+    sleep 10
+
      for i in $(seq 1 $OBJS)
       do
         inject_$inject rep data $poolname ${objbase}-$i $dir 0 || return 1
@@ -235,7 +247,7 @@ function TEST_rados_repair_warning() {
      COUNT=$(ceph pg dump --format=json-pretty | jq ".pg_map.osd_stats_sum.num_shards_repaired")
      test "$COUNT" = "$(expr $OBJS \* 3)" || return 1
  
-    # Give mon a chance to notice additional OSD and unmute
+    # Give mon a chance to notice additional OSD and reset num_shards_repaired
      # The default tick time is 5 seconds
      CHECKTIME=10
      LOOPS=0
diff --git a/src/osd/OSD.cc b/src/osd/OSD.cc

index a14d70605f8a6fc330ca433a31b05bf1062d88b4..a3f887edaa06094f98a18d3532449dbc533c2fb5 100644 (file)
--- a/src/osd/OSD.cc
+++ b/src/osd/OSD.cc
@@ -1051,6 +1051,13 @@ void OSDService::inc_osd_stat_repaired()
    return;
  }
  
+// Overwrite osd_stat.num_shards_repaired with `count` while holding stat_lock.
+void OSDService::set_osd_stat_repaired(int64_t count)
+{
+  std::lock_guard guard(stat_lock);
+  osd_stat.num_shards_repaired = count;
+}
+
  float OSDService::compute_adjusted_ratio(osd_stat_t new_stat, float *pratio,
                                          uint64_t adjust_used)
  {
@@ -3128,6 +3135,11 @@ will start to track new ops received afterwards.";
      scrub_purged_snaps();
    }
  
+  else if (prefix == "clear_shards_repaired") {
+    int64_t count = cmd_getval_or<int64_t>(cmdmap, "count", 0);
+    service.set_osd_stat_repaired(count);
+  }
+
    else if (prefix == "reset_purged_snaps_last") {
      lock_guard l(osd_lock);
      superblock.purged_snaps_last = 0;
@@ -4350,6 +4362,12 @@ void OSD::final_init()
      asok_hook,
      "debug the scrubber");
    ceph_assert(r == 0);
+  r = admin_socket->register_command(
+    "clear_shards_repaired "
+    "name=count,type=CephInt,req=false,range=0",
+    asok_hook,
+    "clear num_shards_repaired to clear health warning");
+  ceph_assert(r == 0);
  
    // -- pg commands --
    // old form: ceph pg <pgid> command ...
diff --git a/src/osd/OSD.h b/src/osd/OSD.h

index 83aed8a410bfa4cd9fbe0d389aee5a74bd45ee55..f59d3ab78822a24ddb1a36cce1c1507b9a189a44 100644 (file)
--- a/src/osd/OSD.h
+++ b/src/osd/OSD.h
@@ -716,6 +716,7 @@ public:
      osd_alert_list_t& alerts);
    osd_stat_t set_osd_stat(std::vector<int>& hb_peers, int num_pgs);
    void inc_osd_stat_repaired(void);
+  void set_osd_stat_repaired(int64_t count);
    float compute_adjusted_ratio(osd_stat_t new_stat, float *pratio, uint64_t adjust_used = 0);
    osd_stat_t get_osd_stat() {
      std::lock_guard l(stat_lock);
author	DanWritesCode <github@dann.me>
	Mon, 18 Dec 2023 21:09:07 +0000 (16:09 -0500)
committer	DanWritesCode <github@dann.me>
	Mon, 4 Mar 2024 21:08:48 +0000 (16:08 -0500)
doc/rados/operations/health-checks.rst		patch \| blob \| history
qa/standalone/osd/osd-rep-recov-eio.sh		patch \| blob \| history
src/osd/OSD.cc		patch \| blob \| history
src/osd/OSD.h		patch \| blob \| history