From ae2c5331fb5ec3b2c81724d19259302903aabdf5 Mon Sep 17 00:00:00 2001 From: David Zafman Date: Wed, 9 Aug 2017 08:43:57 -0700 Subject: [PATCH] qa: Fix races with waiting for scrubs The trigger_scrub command sets the last_scrub_stamp backwards to force a scheduled scrub. In a small window this stamp could get propagated to the mgr. A test failure occurred because wait_for_scrub() was confused by seeing a backward-moving date. The most critical change is having wait_for_scrub() make sure that the stamp advances past its previous value. A test failed because the random backoff kept delaying the triggered scrub, so set osd_scrub_backoff_ratio=0 throughout. Signed-off-by: David Zafman --- qa/standalone/ceph-helpers.sh | 2 +- qa/standalone/scrub/osd-scrub-repair.sh | 13 ++++++------- 2 files changed, 7 insertions(+), 8 deletions(-) diff --git a/qa/standalone/ceph-helpers.sh b/qa/standalone/ceph-helpers.sh index 16996b377ec2..1afd45b8893f 100755 --- a/qa/standalone/ceph-helpers.sh +++ b/qa/standalone/ceph-helpers.sh @@ -1619,7 +1619,7 @@ function wait_for_scrub() { local sname=${3:-last_scrub_stamp} for ((i=0; i < $TIMEOUT; i++)); do - if test "$last_scrub" != "$(get_last_scrub_stamp $pgid $sname)" ; then + if test "$(get_last_scrub_stamp $pgid $sname)" '>' "$last_scrub" ; then return 0 fi sleep 1 diff --git a/qa/standalone/scrub/osd-scrub-repair.sh b/qa/standalone/scrub/osd-scrub-repair.sh index 2af4ea799634..2a8f7ce9576d 100755 --- a/qa/standalone/scrub/osd-scrub-repair.sh +++ b/qa/standalone/scrub/osd-scrub-repair.sh @@ -2535,7 +2535,8 @@ function TEST_periodic_scrub_replicated() { setup $dir || return 1 run_mon $dir a --osd_pool_default_size=2 || return 1 run_mgr $dir x || return 1 - local ceph_osd_args="--osd-scrub-interval-randomize-ratio=0 --osd-deep-scrub-randomize-ratio=0" + local ceph_osd_args="--osd-scrub-interval-randomize-ratio=0 --osd-deep-scrub-randomize-ratio=0 " + ceph_osd_args+="--osd_scrub_backoff_ratio=0" run_osd $dir 0 $ceph_osd_args || return 1 run_osd $dir 1 
$ceph_osd_args || return 1 create_rbd_pool || return 1 @@ -2565,6 +2566,7 @@ function TEST_periodic_scrub_replicated() { # Make sure bad object found rados list-inconsistent-obj $pg | jq '.' | grep -q $objname || return 1 + flush_pg_stats local last_scrub=$(get_last_scrub_stamp $pg) # Fake a schedule scrub CEPH_ARGS='' ceph --admin-daemon $(get_asok_path osd.${primary}) \ @@ -2581,10 +2583,10 @@ function TEST_periodic_scrub_replicated() { # Can't upgrade with this set ceph osd set nodeep-scrub # Let map change propagate to OSDs - sleep 2 + flush pg_stats + sleep 5 # Fake a schedule scrub - local last_scrub=$(get_last_scrub_stamp $pg) CEPH_ARGS='' ceph --admin-daemon $(get_asok_path osd.${primary}) \ trigger_scrub $pg || return 1 # Wait for schedule regular scrub @@ -2601,12 +2603,9 @@ function TEST_periodic_scrub_replicated() { # Bad object still known rados list-inconsistent-obj $pg | jq '.' | grep -q $objname || return 1 + flush_pg_stats # Request a regular scrub and it will be done - local scrub_backoff_ratio=$(get_config osd ${primary} osd_scrub_backoff_ratio) - set_config osd ${primary} osd_scrub_backoff_ratio 0 pg_scrub $pg - sleep 1 - set_config osd ${primary} osd_scrub_backoff_ratio $scrub_backoff_ratio grep -q "Regular scrub request, deep-scrub details will be lost" $dir/osd.${primary}.log || return 1 # deep-scrub error is no longer present -- 2.47.3