]> git.apps.os.sepia.ceph.com Git - ceph.git/commitdiff
qa: Fix races with waiting for scrubs
author David Zafman <dzafman@redhat.com>
Wed, 9 Aug 2017 15:43:57 +0000 (08:43 -0700)
committer David Zafman <dzafman@redhat.com>
Thu, 10 Aug 2017 19:37:05 +0000 (12:37 -0700)
The trigger_scrub command sets the last_scrub_stamp backwards to
force a scheduled scrub.  In a small window this stamp could get propagated
to the mgr.  A test failure occurred because wait_for_scrub() was confused
by seeing a backward-moving date.

The most critical change is having wait_for_scrub() make sure that the
date has advanced past its previous value.

A test failed because the random backoff kept delaying the triggered scrub,
so set osd_scrub_backoff_ratio=0 throughout the test.

Signed-off-by: David Zafman <dzafman@redhat.com>
qa/standalone/ceph-helpers.sh
qa/standalone/scrub/osd-scrub-repair.sh

index 16996b377ec27e9cfc373506bc87d1537ccf402f..1afd45b8893ff26ad029276f5bb762d8fc35f72b 100755 (executable)
@@ -1619,7 +1619,7 @@ function wait_for_scrub() {
     local sname=${3:-last_scrub_stamp}
 
     for ((i=0; i < $TIMEOUT; i++)); do
-        if test "$last_scrub" != "$(get_last_scrub_stamp $pgid $sname)" ; then
+        if test "$(get_last_scrub_stamp $pgid $sname)" '>' "$last_scrub" ; then
             return 0
         fi
         sleep 1
index 2af4ea79963482d8b9250114e80b47cc84703933..2a8f7ce9576dd4f1fa85a1fbdb30a4e193cc10b9 100755 (executable)
@@ -2535,7 +2535,8 @@ function TEST_periodic_scrub_replicated() {
     setup $dir || return 1
     run_mon $dir a --osd_pool_default_size=2 || return 1
     run_mgr $dir x || return 1
-    local ceph_osd_args="--osd-scrub-interval-randomize-ratio=0 --osd-deep-scrub-randomize-ratio=0"
+    local ceph_osd_args="--osd-scrub-interval-randomize-ratio=0 --osd-deep-scrub-randomize-ratio=0 "
+    ceph_osd_args+="--osd_scrub_backoff_ratio=0"
     run_osd $dir 0 $ceph_osd_args || return 1
     run_osd $dir 1 $ceph_osd_args || return 1
     create_rbd_pool || return 1
@@ -2565,6 +2566,7 @@ function TEST_periodic_scrub_replicated() {
     # Make sure bad object found
     rados list-inconsistent-obj $pg | jq '.' | grep -q $objname || return 1
 
+    flush_pg_stats
     local last_scrub=$(get_last_scrub_stamp $pg)
     # Fake a schedule scrub
     CEPH_ARGS='' ceph --admin-daemon $(get_asok_path osd.${primary}) \
@@ -2581,10 +2583,10 @@ function TEST_periodic_scrub_replicated() {
     # Can't upgrade with this set
     ceph osd set nodeep-scrub
     # Let map change propagate to OSDs
-    sleep 2
+    flush pg_stats
+    sleep 5
 
     # Fake a schedule scrub
-    local last_scrub=$(get_last_scrub_stamp $pg)
     CEPH_ARGS='' ceph --admin-daemon $(get_asok_path osd.${primary}) \
              trigger_scrub $pg || return 1
     # Wait for schedule regular scrub
@@ -2601,12 +2603,9 @@ function TEST_periodic_scrub_replicated() {
     # Bad object still known
     rados list-inconsistent-obj $pg | jq '.' | grep -q $objname || return 1
 
+    flush_pg_stats
     # Request a regular scrub and it will be done
-    local scrub_backoff_ratio=$(get_config osd ${primary} osd_scrub_backoff_ratio)
-    set_config osd ${primary} osd_scrub_backoff_ratio 0
     pg_scrub $pg
-    sleep 1
-    set_config osd ${primary} osd_scrub_backoff_ratio $scrub_backoff_ratio
     grep -q "Regular scrub request, deep-scrub details will be lost" $dir/osd.${primary}.log || return 1
 
     # deep-scrub error is no longer present