From 1a63a63d411457173670c230b1484a74fef9104a Mon Sep 17 00:00:00 2001
From: David Zafman
Date: Thu, 30 Jul 2020 00:55:10 +0000
Subject: [PATCH] osd, test: Special Nautilus handling because there is no health mute

osd: Add clear_shards_repaired osd tell command

test: Replace mute handling in test case

Signed-off-by: David Zafman
---
 PendingReleaseNotes                    |  7 ++++++-
 doc/rados/operations/health-checks.rst |  7 +++++++
 qa/standalone/osd/osd-rep-recov-eio.sh |  9 +++++++--
 src/osd/OSD.cc                         | 16 ++++++++++++++++
 src/osd/OSD.h                          |  1 +
 src/osd/PGBackend.h                    |  1 +
 src/osd/PrimaryLogPG.h                 |  3 +++
 7 files changed, 41 insertions(+), 3 deletions(-)

diff --git a/PendingReleaseNotes b/PendingReleaseNotes
index 0cb749f0283f..61af03c31fb0 100644
--- a/PendingReleaseNotes
+++ b/PendingReleaseNotes
@@ -15,4 +15,9 @@
 * Monitors now have a config option ``mon_osd_warn_num_repaired``, 10 by default.
   If any OSD has repaired more than this many I/O errors in stored data a
-  ``OSD_TOO_MANY_REPAIRS`` health warning is generated.
+  ``OSD_TOO_MANY_REPAIRS`` health warning is generated. In order to allow
+  clearing of the warning, a new command ``ceph tell osd.# clear_shards_repaired [count]``
+  has been added. By default it sets the repair count to 0. If you want to be
+  warned again when additional repairs are performed, pass the value of
+  ``mon_osd_warn_num_repaired`` as the count.
+  This command will be replaced in future releases by the health mute/unmute feature.

diff --git a/doc/rados/operations/health-checks.rst b/doc/rados/operations/health-checks.rst
index 5a2cb9628064..7bf477e91701 100644
--- a/doc/rados/operations/health-checks.rst
+++ b/doc/rados/operations/health-checks.rst
@@ -594,6 +594,13 @@
 that aren't seeing scrub errors, a count of read repairs is maintained. If
 it exceeds a config value threshold *mon_osd_warn_num_repaired* default 10,
 this health warning is generated.

+In order to allow clearing of the warning, a new command
+``ceph tell osd.# clear_shards_repaired [count]`` has been added.
+By default it sets the repair count to 0. To re-enable the warning
+when any additional repairs are performed, pass the value of
+``mon_osd_warn_num_repaired`` as the count.
+This command will be replaced in future releases by the health mute/unmute feature.
+
 LARGE_OMAP_OBJECTS
 __________________

diff --git a/qa/standalone/osd/osd-rep-recov-eio.sh b/qa/standalone/osd/osd-rep-recov-eio.sh
index 613bfc316f7c..ab7c1501e426 100755
--- a/qa/standalone/osd/osd-rep-recov-eio.sh
+++ b/qa/standalone/osd/osd-rep-recov-eio.sh
@@ -209,12 +209,17 @@ function TEST_rados_repair_warning() {
     ceph health | grep -q "Too many repaired reads on 1 OSDs" || return 1
     ceph health detail | grep -q "osd.$primary had $OBJS reads repaired" || return 1

-    ceph health mute OSD_TOO_MANY_REPAIRS
+    ceph tell osd.$primary clear_shards_repaired
+    sleep 10
+
     set -o pipefail
     # Should mute this
     ceph health | $(! grep -q "Too many repaired reads on 1 OSDs") || return 1
     set +o pipefail

+    ceph tell osd.$primary clear_shards_repaired $OBJS
+    sleep 10
+
     for i in $(seq 1 $OBJS)
     do
       inject_$inject rep data $poolname ${objbase}-$i $dir 0 || return 1
@@ -230,7 +235,7 @@ function TEST_rados_repair_warning() {
     COUNT=$(ceph pg dump --format=json-pretty | jq ".pg_map.osd_stats_sum.num_shards_repaired")
     test "$COUNT" = "$(expr $OBJS \* 3)" || return 1

-    # Give mon a chance to notice additional OSD and unmute
+    # Give mon a chance to notice additional OSD and reset num_shards_repaired
     # The default tick time is 5 seconds
     CHECKTIME=10
     LOOPS=0

diff --git a/src/osd/OSD.cc b/src/osd/OSD.cc
index 5e6064e0ef68..9c59bce24da8 100644
--- a/src/osd/OSD.cc
+++ b/src/osd/OSD.cc
@@ -982,6 +982,13 @@ void OSDService::inc_osd_stat_repaired()
   return;
 }

+void OSDService::set_osd_stat_repaired(int64_t count)
+{
+  std::lock_guard l(stat_lock);
+  osd_stat.num_shards_repaired = count;
+  return;
+}
+
 float OSDService::compute_adjusted_ratio(osd_stat_t new_stat, float *pratio,
                                          uint64_t adjust_used)
 {
@@ -6750,6 +6757,10 @@ COMMAND("cluster_log " \
   "name=message,type=CephString,n=N",
   "log a message to the cluster log",
   "osd", "rw")
+COMMAND("clear_shards_repaired " \
+  "name=count,type=CephInt,req=false",
+  "clear num_shards_repaired to clear health warning",
+  "osd", "rw")
 COMMAND("bench " \
   "name=count,type=CephInt,req=false " \
   "name=size,type=CephInt,req=false " \
@@ -6968,6 +6979,11 @@ int OSD::_do_command(
     }
     clog->do_log(level, message);
   }
+  else if (prefix == "clear_shards_repaired") {
+    int64_t count;
+    cmd_getval(cct, cmdmap, "count", count, (int64_t) 0);
+    service.set_osd_stat_repaired(count);
+  }

   // either 'pg <pgid> <command>' or
   // 'tell <pgid>' (which comes in without any of that prefix)?

diff --git a/src/osd/OSD.h b/src/osd/OSD.h
index 80ca9af538cd..8c87823d2da7 100644
--- a/src/osd/OSD.h
+++ b/src/osd/OSD.h
@@ -909,6 +909,7 @@ public:
                      osd_alert_list_t& alerts);
   osd_stat_t set_osd_stat(vector<int>& hb_peers, int num_pgs);
   void inc_osd_stat_repaired(void);
+  void set_osd_stat_repaired(int64_t);
   float compute_adjusted_ratio(osd_stat_t new_stat, float *pratio, uint64_t adjust_used = 0);
   osd_stat_t get_osd_stat() {
     std::lock_guard l(stat_lock);

diff --git a/src/osd/PGBackend.h b/src/osd/PGBackend.h
index b9641a5305c7..e19695fcd6db 100644
--- a/src/osd/PGBackend.h
+++ b/src/osd/PGBackend.h
@@ -298,6 +298,7 @@ typedef std::shared_ptr<const OSDMap> OSDMapRef;
     virtual bool pg_is_repair() = 0;

     virtual void inc_osd_stat_repaired() = 0;
+    virtual void set_osd_stat_repaired(int64_t) = 0;
     virtual bool pg_is_remote_backfilling() = 0;
     virtual void pg_add_local_num_bytes(int64_t num_bytes) = 0;
     virtual void pg_sub_local_num_bytes(int64_t num_bytes) = 0;

diff --git a/src/osd/PrimaryLogPG.h b/src/osd/PrimaryLogPG.h
index 56fdc87fe609..39f3b04c28db 100644
--- a/src/osd/PrimaryLogPG.h
+++ b/src/osd/PrimaryLogPG.h
@@ -410,6 +410,9 @@ public:
   void inc_osd_stat_repaired() override {
     osd->inc_osd_stat_repaired();
   }
+  void set_osd_stat_repaired(int64_t count) override {
+    osd->set_osd_stat_repaired(count);
+  }
   bool pg_is_remote_backfilling() override {
     return is_remote_backfilling();
   }
-- 
2.47.3