From: DanWritesCode Date: Mon, 18 Dec 2023 21:09:07 +0000 (-0500) Subject: osd: add clear_shards_repaired command X-Git-Tag: v20.0.0~732^2~1 X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=78d6bfe54c3b9b60fab36a640b1ce77c8f022fa9;p=ceph.git osd: add clear_shards_repaired command This command will allow us to clear the OSD_TOO_MANY_REPAIRS alert by setting the shard repair count to 0. This will help in cases where the alert was a false positive, or a condition that has since cleared at the disk level. Often, zeroing out the repair count is better than muting the alert or restarting the OSD. Fixes: https://tracker.ceph.com/issues/54182 Co-authored-by: David Zafman Signed-off-by: Daniel Radjenovic --- diff --git a/doc/rados/operations/health-checks.rst b/doc/rados/operations/health-checks.rst index 54bfd427967..4059911654a 100644 --- a/doc/rados/operations/health-checks.rst +++ b/doc/rados/operations/health-checks.rst @@ -983,6 +983,13 @@ the object data, there might exist failing disks that are not registering any scrub errors. This repair count is maintained as a way of identifying any such failing disks. +In order to allow clearing of the warning, a new command +``ceph tell osd.# clear_shards_repaired [count]`` has been added. +By default it will set the repair count to 0. A ``count`` value can be passed +to the command. Thus, the administrator has the option to re-enable the warning +by passing the value of ``mon_osd_warn_num_repaired`` (or above) to the command. +An alternative to using ``clear_shards_repaired`` is to mute the +``OSD_TOO_MANY_REPAIRS`` alert with ``ceph health mute``. LARGE_OMAP_OBJECTS __________________ diff --git a/qa/standalone/osd/osd-rep-recov-eio.sh b/qa/standalone/osd/osd-rep-recov-eio.sh index 6fea441b3a9..a34f4a47189 100755 --- a/qa/standalone/osd/osd-rep-recov-eio.sh +++ b/qa/standalone/osd/osd-rep-recov-eio.sh @@ -219,6 +219,18 @@ function TEST_rados_repair_warning() { ceph health | $(! 
grep -q "Too many repaired reads on 1 OSDs") || return 1 set +o pipefail + ceph health unmute OSD_TOO_MANY_REPAIRS + ceph tell osd.$primary clear_shards_repaired + sleep 10 + + set -o pipefail + # Should clear this + ceph health | $(! grep -q "Too many repaired reads on 1 OSDs") || return 1 + set +o pipefail + + ceph tell osd.$primary clear_shards_repaired $OBJS + sleep 10 + for i in $(seq 1 $OBJS) do inject_$inject rep data $poolname ${objbase}-$i $dir 0 || return 1 @@ -235,7 +247,7 @@ function TEST_rados_repair_warning() { COUNT=$(ceph pg dump --format=json-pretty | jq ".pg_map.osd_stats_sum.num_shards_repaired") test "$COUNT" = "$(expr $OBJS \* 3)" || return 1 - # Give mon a chance to notice additional OSD and unmute + # Give mon a chance to notice additional OSD and reset num_shards_repaired # The default tick time is 5 seconds CHECKTIME=10 LOOPS=0 diff --git a/src/osd/OSD.cc b/src/osd/OSD.cc index a14d70605f8..a3f887edaa0 100644 --- a/src/osd/OSD.cc +++ b/src/osd/OSD.cc @@ -1051,6 +1051,13 @@ void OSDService::inc_osd_stat_repaired() return; } +void OSDService::set_osd_stat_repaired(int64_t count) +{ + std::lock_guard l(stat_lock); + osd_stat.num_shards_repaired = count; + return; +} + float OSDService::compute_adjusted_ratio(osd_stat_t new_stat, float *pratio, uint64_t adjust_used) { @@ -3128,6 +3135,11 @@ will start to track new ops received afterwards."; scrub_purged_snaps(); } + else if (prefix == "clear_shards_repaired") { + int64_t count = cmd_getval_or<int64_t>(cmdmap, "count", 0); + service.set_osd_stat_repaired(count); + } + else if (prefix == "reset_purged_snaps_last") { lock_guard l(osd_lock); superblock.purged_snaps_last = 0; @@ -4350,6 +4362,12 @@ void OSD::final_init() asok_hook, "debug the scrubber"); ceph_assert(r == 0); + r = admin_socket->register_command( + "clear_shards_repaired " + "name=count,type=CephInt,req=false,range=0", + asok_hook, + "clear num_shards_repaired to clear health warning"); + ceph_assert(r == 0); // -- pg commands -- // old form: 
ceph pg <pgid> command ... diff --git a/src/osd/OSD.h b/src/osd/OSD.h index 83aed8a410b..f59d3ab7882 100644 --- a/src/osd/OSD.h +++ b/src/osd/OSD.h @@ -716,6 +716,7 @@ public: osd_alert_list_t& alerts); osd_stat_t set_osd_stat(std::vector<int>& hb_peers, int num_pgs); void inc_osd_stat_repaired(void); + void set_osd_stat_repaired(int64_t count); float compute_adjusted_ratio(osd_stat_t new_stat, float *pratio, uint64_t adjust_used = 0); osd_stat_t get_osd_stat() { std::lock_guard l(stat_lock);