From: David Zafman
Date: Wed, 10 Jun 2020 02:24:00 +0000 (-0700)
Subject: mgr: Warn when too many reads are repaired on an OSD
X-Git-Tag: wip-pdonnell-testing-20200918.022351~965^2~1
X-Git-Url: http://git.apps.os.sepia.ceph.com/?a=commitdiff_plain;h=661996d4342c427209b1eae4b0247f8210a00fc3;p=ceph-ci.git

mgr: Warn when too many reads are repaired on an OSD

Include test case

Configurable by setting mon_osd_warn_num_repaired (default 10)

Ignore new health warning with random eio injection test

Fixes: https://tracker.ceph.com/issues/41564

Signed-off-by: David Zafman
---

diff --git a/PendingReleaseNotes b/PendingReleaseNotes
index 1598bc5b5da..994b172de1a 100644
--- a/PendingReleaseNotes
+++ b/PendingReleaseNotes
@@ -77,3 +77,7 @@
   librbd to load the plugin by adding the following to your configuration::

     rbd_plugins = parent_cache
+
+* Monitors now have a config option ``mon_osd_warn_num_repaired``, 10 by default.
+  If any OSD has repaired more than this many I/O errors in stored data, an
+  ``OSD_TOO_MANY_REPAIRS`` health warning is generated.
diff --git a/doc/rados/operations/health-checks.rst b/doc/rados/operations/health-checks.rst
index 41502376649..47e31b51c04 100644
--- a/doc/rados/operations/health-checks.rst
+++ b/doc/rados/operations/health-checks.rst
@@ -710,6 +710,16 @@
 paired with *PG_DAMAGED* (see above).

 See :doc:`pg-repair` for more information.

+OSD_TOO_MANY_REPAIRS
+____________________
+
+When a read error occurs and another replica is available, it is used to repair
+the error immediately so that the client can get the object data. Scrub
+handles errors for data at rest. To identify possibly failing disks that
+aren't seeing scrub errors, a count of read repairs is maintained. If the
+count exceeds the configurable threshold *mon_osd_warn_num_repaired*
+(default 10), this health warning is generated.
+
 LARGE_OMAP_OBJECTS
 __________________
diff --git a/qa/standalone/osd/osd-rep-recov-eio.sh b/qa/standalone/osd/osd-rep-recov-eio.sh
index 8dce41a98bb..613bfc316f7 100755
--- a/qa/standalone/osd/osd-rep-recov-eio.sh
+++ b/qa/standalone/osd/osd-rep-recov-eio.sh
@@ -19,6 +19,8 @@ source $CEPH_ROOT/qa/standalone/ceph-helpers.sh

+warnings=10
+
 function run() {
     local dir=$1
     shift
@@ -32,7 +34,8 @@ function run() {
     local funcs=${@:-$(set | sed -n -e 's/^\(TEST_[0-9a-z_]*\) .*/\1/p')}
     for func in $funcs ; do
         setup $dir || return 1
-        run_mon $dir a || return 1
+        # set warning amount in case default changes
+        run_mon $dir a --mon_osd_warn_num_repaired=$warnings || return 1
         run_mgr $dir x || return 1
         ceph osd pool create foo 8 || return 1
@@ -171,6 +174,86 @@ function TEST_rados_get_with_eio() {
     delete_pool $poolname
 }

+function TEST_rados_repair_warning() {
+    local dir=$1
+    local OBJS=$(expr $warnings + 1)
+
+    setup_osds 4 || return 1
+
+    local poolname=pool-rep
+    create_pool $poolname 1 1 || return 1
+    wait_for_clean || return 1
+
+    local poolname=pool-rep
+    local objbase=obj-warn-
+    local inject=eio
+
+    for i in $(seq 1 $OBJS)
+    do
+        rados_put $dir $poolname ${objbase}-$i || return 1
+        inject_$inject rep data $poolname ${objbase}-$i $dir 0 || return 1
+        rados_get $dir $poolname ${objbase}-$i || return 1
+    done
+    local pgid=$(get_pg $poolname ${objbase}-1)
+
+    local object_osds=($(get_osds $poolname ${objbase}-1))
+    local primary=${object_osds[0]}
+    local bad_peer=${object_osds[1]}
+
+    COUNT=$(ceph pg $pgid query | jq '.info.stats.stat_sum.num_objects_repaired')
+    test "$COUNT" = "$OBJS" || return 1
+    flush_pg_stats
+    COUNT=$(ceph pg dump --format=json-pretty | jq ".pg_map.osd_stats_sum.num_shards_repaired")
+    test "$COUNT" = "$OBJS" || return 1
+
+    ceph health | grep -q "Too many repaired reads on 1 OSDs" || return 1
+    ceph health detail | grep -q "osd.$primary had $OBJS reads repaired" || return 1
+
+    ceph health mute OSD_TOO_MANY_REPAIRS
+    set -o pipefail
+    # Should mute this
+    ceph health | $(! grep -q "Too many repaired reads on 1 OSDs") || return 1
+    set +o pipefail
+
+    for i in $(seq 1 $OBJS)
+    do
+        inject_$inject rep data $poolname ${objbase}-$i $dir 0 || return 1
+        inject_$inject rep data $poolname ${objbase}-$i $dir 1 || return 1
+        # Force primary to pull from the bad peer, so we can repair it too!
+        set_config osd $primary osd_debug_feed_pullee $bad_peer || return 1
+        rados_get $dir $poolname ${objbase}-$i || return 1
+    done
+
+    COUNT=$(ceph pg $pgid query | jq '.info.stats.stat_sum.num_objects_repaired')
+    test "$COUNT" = "$(expr $OBJS \* 2)" || return 1
+    flush_pg_stats
+    COUNT=$(ceph pg dump --format=json-pretty | jq ".pg_map.osd_stats_sum.num_shards_repaired")
+    test "$COUNT" = "$(expr $OBJS \* 3)" || return 1
+
+    # Give mon a chance to notice additional OSD and unmute
+    # The default tick time is 5 seconds
+    CHECKTIME=10
+    LOOPS=0
+    while(true)
+    do
+        sleep 1
+        if ceph health | grep -q "Too many repaired reads on 2 OSDs"
+        then
+            break
+        fi
+        LOOPS=$(expr $LOOPS + 1)
+        if test "$LOOPS" = "$CHECKTIME"
+        then
+            echo "Too many repaired reads not seen after $CHECKTIME seconds"
+            return 1
+        fi
+    done
+    ceph health detail | grep -q "osd.$primary had $(expr $OBJS \* 2) reads repaired" || return 1
+    ceph health detail | grep -q "osd.$bad_peer had $OBJS reads repaired" || return 1
+
+    delete_pool $poolname
+}
+
 # Test backfill with unfound object
 function TEST_rep_backfill_unfound() {
     local dir=$1
diff --git a/qa/suites/rados/singleton/all/random-eio.yaml b/qa/suites/rados/singleton/all/random-eio.yaml
index fd120680515..1d5adae7587 100644
--- a/qa/suites/rados/singleton/all/random-eio.yaml
+++ b/qa/suites/rados/singleton/all/random-eio.yaml
@@ -24,6 +24,7 @@ tasks:
       - overall HEALTH_
       - \(POOL_APP_NOT_ENABLED\)
       - \(PG_DEGRADED\)
+      - \(OSD_TOO_MANY_REPAIRS\)
 - full_sequential:
   - exec:
       client.0:
diff --git a/src/common/options.cc b/src/common/options.cc
index 495dc5273c1..9dfbc317803 100644
--- a/src/common/options.cc
+++ b/src/common/options.cc
@@ -1522,6 +1522,11 @@ std::vector
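
A rough operator-facing sketch of how the new warning might be tuned or silenced, assuming the option and health code land as named in this patch; `ceph config set` and `ceph health mute` are existing Ceph CLI commands, and the threshold value and mute TTL below are only illustrative:

    # Raise the per-OSD read-repair threshold from the default of 10 to 50
    # (mon_osd_warn_num_repaired is the option introduced by this change).
    ceph config set mon mon_osd_warn_num_repaired 50

    # See which OSDs have accumulated repaired reads.
    ceph health detail | grep 'reads repaired'

    # Silence the warning for a week, e.g. while a suspect disk is replaced
    # (the test above exercises the same mute mechanism, without a TTL).
    ceph health mute OSD_TOO_MANY_REPAIRS 1w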