From 410e230d091567641a7c1b61a93aa7164aab9b1b Mon Sep 17 00:00:00 2001
From: David Zafman
Date: Fri, 30 Oct 2020 01:28:00 +0000
Subject: [PATCH] test: Fix race in TEST_recovery_scrub test

Fixes: https://tracker.ceph.com/issues/47930

Signed-off-by: David Zafman
---
 qa/standalone/scrub/osd-recovery-scrub.sh | 143 ++++++++++++++++++++--
 1 file changed, 132 insertions(+), 11 deletions(-)

diff --git a/qa/standalone/scrub/osd-recovery-scrub.sh b/qa/standalone/scrub/osd-recovery-scrub.sh
index 0249b9fb9faf4..944c16a02aa24 100755
--- a/qa/standalone/scrub/osd-recovery-scrub.sh
+++ b/qa/standalone/scrub/osd-recovery-scrub.sh
@@ -32,7 +32,107 @@ function run() {
     done
 }
 
-function TEST_recovery_scrub() {
+# Simple test for "not scheduling scrubs due to active recovery"
+# OSD::sched_scrub() is called on all OSDs during ticks
+function TEST_recovery_scrub_1() {
+    local dir=$1
+    local poolname=test
+
+    TESTDATA="testdata.$$"
+    OSDS=4
+    PGS=1
+    OBJECTS=100
+
+    setup $dir || return 1
+    run_mon $dir a --osd_pool_default_size=1 --mon_allow_pool_size_one=true \
+            --osd_scrub_interval_randomize_ratio=0.0 || return 1
+    run_mgr $dir x || return 1
+    for osd in $(seq 0 $(expr $OSDS - 1))
+    do
+        run_osd $dir $osd --osd_scrub_during_recovery=false || return 1
+    done
+
+    # Create a pool with $PGS pgs
+    create_pool $poolname $PGS $PGS
+    wait_for_clean || return 1
+    poolid=$(ceph osd dump | grep "^pool.*[']test[']" | awk '{ print $2 }')
+
+    ceph pg dump pgs
+
+    dd if=/dev/urandom of=$TESTDATA bs=1M count=50
+    for i in $(seq 1 $OBJECTS)
+    do
+        rados -p $poolname put obj${i} $TESTDATA
+    done
+    rm -f $TESTDATA
+
+    ceph osd pool set $poolname size 4
+
+    # Wait for recovery to start
+    set -o pipefail
+    count=0
+    while(true)
+    do
+        if ceph --format json pg dump pgs |
+            jq '.pg_stats | [.[] | .state | contains("recovering")]' | grep -q true
+        then
+            break
+        fi
+        sleep 2
+        if test "$count" -eq "10"
+        then
+            echo "Recovery never started"
+            return 1
+        fi
+        count=$(expr $count + 1)
+    done
+    set +o pipefail
+    ceph pg dump pgs
+
+    sleep 10
+    # Workaround for http://tracker.ceph.com/issues/38195
+    kill_daemons $dir #|| return 1
+
+    declare -a err_strings
+    err_strings[0]="not scheduling scrubs due to active recovery"
+
+    for osd in $(seq 0 $(expr $OSDS - 1))
+    do
+        grep "not scheduling scrubs" $dir/osd.${osd}.log
+    done
+    for err_string in "${err_strings[@]}"
+    do
+        found=false
+        count=0
+        for osd in $(seq 0 $(expr $OSDS - 1))
+        do
+            if grep -q "$err_string" $dir/osd.${osd}.log
+            then
+                found=true
+                count=$(expr $count + 1)
+            fi
+        done
+        if [ "$found" = "false" ]; then
+            echo "Missing log message '$err_string'"
+            ERRORS=$(expr $ERRORS + 1)
+        fi
+        [ $count -eq $OSDS ] || return 1
+    done
+
+    teardown $dir || return 1
+
+    if [ $ERRORS != "0" ];
+    then
+        echo "TEST FAILED WITH $ERRORS ERRORS"
+        return 1
+    fi
+
+    echo "TEST PASSED"
+    return 0
+}
+
+# osd_scrub_during_recovery=true makes sure a scrub happens
+function TEST_recovery_scrub_2() {
     local dir=$1
     local poolname=test
 
@@ -42,11 +142,12 @@ function TEST_recovery_scrub() {
     OBJECTS=4
 
     setup $dir || return 1
-    run_mon $dir a --osd_pool_default_size=1 --mon_allow_pool_size_one=true || return 1
+    run_mon $dir a --osd_pool_default_size=1 --mon_allow_pool_size_one=true \
+            --osd_scrub_interval_randomize_ratio=0.0 || return 1
     run_mgr $dir x || return 1
     for osd in $(seq 0 $(expr $OSDS - 1))
    do
-        run_osd $dir $osd || return 1
+        run_osd $dir $osd --osd_scrub_during_recovery=true || return 1
     done
 
     # Create a pool with $PGS pgs
@@ -61,7 +162,30 @@ function TEST_recovery_scrub() {
     done
     rm -f $TESTDATA
 
-    ceph osd pool set $poolname size 4
+    ceph osd pool set $poolname size 3
+
+    ceph pg dump pgs
+
+    # Wait for recovery to start
+    set -o pipefail
+    count=0
+    while(true)
+    do
+        if ceph --format json pg dump pgs |
+            jq '.pg_stats | [.[] | .state | contains("recovering")]' | grep -q true
+        then
+            break
+        fi
+        sleep 2
+        if test "$count" -eq "10"
+        then
+            echo "Recovery never started"
+            return 1
+        fi
+        count=$(expr $count + 1)
+    done
+    set +o pipefail
+    ceph pg dump pgs
 
     pids=""
     for pg in $(seq 0 $(expr $PGS - 1))
@@ -79,7 +203,7 @@ function TEST_recovery_scrub() {
     if ! kill -0 $pid
     then
         echo "OSD crash occurred"
-        tail -100 $dir/osd.0.log
+        #tail -100 $dir/osd.0.log
         ERRORS=$(expr $ERRORS + 1)
     fi
 
@@ -88,13 +212,10 @@ function TEST_recovery_scrub() {
     declare -a err_strings
     err_strings[0]="not scheduling scrubs due to active recovery"
-    # Test with these two strings after disabled check in OSD::sched_scrub()
-    #err_strings[0]="handle_scrub_reserve_request: failed to reserve remotely"
-    #err_strings[1]="sched_scrub: failed to reserve locally"
 
     for osd in $(seq 0 $(expr $OSDS - 1))
     do
-        grep "failed to reserve\|not scheduling scrubs" $dir/osd.${osd}.log
+        grep "not scheduling scrubs" $dir/osd.${osd}.log
     done
     for err_string in "${err_strings[@]}"
     do
         found=false
             fi
         done
-        if [ "$found" = "false" ]; then
-            echo "Missing log message '$err_string'"
+        if [ "$found" = "true" ]; then
+            echo "Found unexpected log message '$err_string'"
             ERRORS=$(expr $ERRORS + 1)
         fi
     done
-- 
2.39.5
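
Note: both tests now close the race by polling 'ceph pg dump pgs' until some PG actually reports a "recovering" state before proceeding. Below is a minimal standalone sketch of that polling pattern, not part of the patch itself: the helper name wait_for_recovery_to_start and the retry/delay defaults are assumptions, and it only presumes the ceph and jq binaries are on the PATH, as the standalone test harness already does.

#!/usr/bin/env bash
# Sketch (assumed helper, not from the patch): poll until any PG reports a
# "recovering" state, or give up after a bounded number of attempts.
function wait_for_recovery_to_start() {
    local retries=${1:-10}   # assumed default: number of polls before giving up
    local delay=${2:-2}      # assumed default: seconds between polls
    local count=0

    set -o pipefail
    while true
    do
        # jq emits "true" entries if any PG state string contains "recovering"
        if ceph --format json pg dump pgs |
            jq '.pg_stats | [.[] | .state | contains("recovering")]' | grep -q true
        then
            set +o pipefail
            return 0
        fi
        sleep "$delay"
        count=$(expr $count + 1)
        if test "$count" -ge "$retries"
        then
            echo "Recovery never started"
            set +o pipefail
            return 1
        fi
    done
}

# Example use, mirroring the tests above:
#   ceph osd pool set test size 3
#   wait_for_recovery_to_start || return 1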