From: Loic Dachary Date: Sat, 28 Nov 2015 16:55:46 +0000 (+0100) Subject: tests: fix race condition testing auto scrub X-Git-Tag: v10.0.2~156^2 X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=refs%2Fpull%2F6724%2Fhead;p=ceph.git tests: fix race condition testing auto scrub When testing auto scrub, waiting 20 seconds for the scrub to complete is sometimes not enough and creates false negatives. Split wait_for_scrub out of the repair helper so that it can be used to wait for the scrub to happen instead of using a timer. The scrub timestamp is obtained after removing the object, therefore there is a chance for the scrub to be finished already. But since auto scrub is scheduled every 5 seconds, it will only make the test wait an extra 5 seconds and not hang forever. http://tracker.ceph.com/issues/13592 Signed-off-by: Xinze Chi Signed-off-by: Loic Dachary --- diff --git a/qa/workunits/ceph-helpers.sh b/qa/workunits/ceph-helpers.sh index 05b1a678fb03..39eb54023ef7 100755 --- a/qa/workunits/ceph-helpers.sh +++ b/qa/workunits/ceph-helpers.sh @@ -1019,9 +1019,7 @@ function test_wait_for_clean() { ## # Run repair on **pgid** and wait until it completes. The repair # function will fail if repair does not complete within $TIMEOUT -# seconds. The repair is complete whenever the -# **get_last_scrub_stamp** function reports a timestamp different from -# the one stored before starting the repair. +# seconds. 
# # @param pgid the id of the PG # @return 0 on success, 1 on error @@ -1029,15 +1027,8 @@ function test_wait_for_clean() { function repair() { local pgid=$1 local last_scrub=$(get_last_scrub_stamp $pgid) - ceph pg repair $pgid - for ((i=0; i < $TIMEOUT; i++)); do - if test "$last_scrub" != "$(get_last_scrub_stamp $pgid)" ; then - return 0 - fi - sleep 1 - done - return 1 + wait_for_scrub $pgid "$last_scrub" } function test_repair() { @@ -1106,6 +1097,48 @@ function test_expect_failure() { ####################################################################### +## +# Given the *last_scrub*, wait for scrub to happen on **pgid**. It +# will fail if scrub does not complete within $TIMEOUT seconds. The +# scrub is complete whenever the **get_last_scrub_stamp** function +# reports a timestamp different from the one given in argument. +# +# @param pgid the id of the PG +# @param last_scrub timestamp of the last scrub for *pgid* +# @return 0 on success, 1 on error +# +function wait_for_scrub() { + local pgid=$1 + local last_scrub="$2" + + for ((i=0; i < $TIMEOUT; i++)); do + if test "$last_scrub" != "$(get_last_scrub_stamp $pgid)" ; then + return 0 + fi + sleep 1 + done + return 1 +} + +function test_wait_for_scrub() { + local dir=$1 + + setup $dir || return 1 + run_mon $dir a --osd_pool_default_size=1 || return 1 + run_osd $dir 0 || return 1 + wait_for_clean || return 1 + local pgid=1.0 + ceph pg repair $pgid + local last_scrub=$(get_last_scrub_stamp $pgid) + wait_for_scrub $pgid "$last_scrub" || return 1 + kill_daemons $dir KILL osd || return 1 + last_scrub=$(get_last_scrub_stamp $pgid) + ! TIMEOUT=1 wait_for_scrub $pgid "$last_scrub" || return 1 + teardown $dir || return 1 +} + +####################################################################### + ## # Return 0 if the erasure code *plugin* is available, 1 otherwise. 
# diff --git a/src/test/osd/osd-scrub-repair.sh b/src/test/osd/osd-scrub-repair.sh index ca329975ac41..2b22ab906b42 100755 --- a/src/test/osd/osd-scrub-repair.sh +++ b/src/test/osd/osd-scrub-repair.sh @@ -159,25 +159,24 @@ function TEST_auto_repair_erasure_coded() { --osd-scrub-min-interval=5 \ --osd-scrub-interval-randomize-ratio=0 done - wait_for_clean || return 1 # Create an EC pool ceph osd erasure-code-profile set myprofile \ k=2 m=1 ruleset-failure-domain=osd || return 1 ceph osd pool create $poolname 8 8 erasure myprofile || return 1 - wait_for_clean || return 1 # Put an object local payload=ABCDEF echo $payload > $dir/ORIGINAL rados --pool $poolname put SOMETHING $dir/ORIGINAL || return 1 + wait_for_clean || return 1 # Remove the object from one shard physically objectstore_tool $dir $(get_not_primary $poolname SOMETHING) SOMETHING remove || return 1 - - # Give some time for auto repair - sleep 20 - + # Wait for auto repair + local pgid=$(get_pg $poolname SOMETHING) + wait_for_scrub $pgid "$(get_last_scrub_stamp $pgid)" + wait_for_clean || return 1 # Verify - the file should be back objectstore_tool $dir $(get_not_primary $poolname SOMETHING) SOMETHING list-attrs || return 1 rados --pool $poolname get SOMETHING $dir/COPY || return 1