tests: fix race condition testing auto scrub

author Loic Dachary <ldachary@redhat.com>

Sat, 28 Nov 2015 16:55:46 +0000 (17:55 +0100)

committer Loic Dachary <ldachary@redhat.com>

Sun, 29 Nov 2015 11:17:16 +0000 (12:17 +0100)
author Loic Dachary <ldachary@redhat.com>
Sat, 28 Nov 2015 16:55:46 +0000 (17:55 +0100)
committer Loic Dachary <ldachary@redhat.com>
Sun, 29 Nov 2015 11:17:16 +0000 (12:17 +0100)
diff --git a/qa/workunits/ceph-helpers.sh b/qa/workunits/ceph-helpers.sh

index 05b1a678fb035f2bc27401de2a09c715b02e4692..39eb54023ef7cb2afcac32c12adf92b254df77b6 100755 (executable)
--- a/qa/workunits/ceph-helpers.sh
+++ b/qa/workunits/ceph-helpers.sh
@@ -1019,9 +1019,7 @@ function test_wait_for_clean() {
  ##
  # Run repair on **pgid** and wait until it completes. The repair
  # function will fail if repair does not complete within $TIMEOUT
-# seconds. The repair is complete whenever the
-# **get_last_scrub_stamp** function reports a timestamp different from
-# the one stored before starting the repair.
+# seconds.
  #
  # @param pgid the id of the PG
  # @return 0 on success, 1 on error
@@ -1029,15 +1027,8 @@ function test_wait_for_clean() {
  function repair() {
      local pgid=$1
      local last_scrub=$(get_last_scrub_stamp $pgid)
-
      ceph pg repair $pgid
-    for ((i=0; i < $TIMEOUT; i++)); do
-        if test "$last_scrub" != "$(get_last_scrub_stamp $pgid)" ; then
-            return 0
-        fi
-        sleep 1
-    done
-    return 1
+    wait_for_scrub $pgid "$last_scrub"
  }
  
  function test_repair() {
@@ -1106,6 +1097,48 @@ function test_expect_failure() {
  
  #######################################################################
  
+##
+# Given the *last_scrub*, wait for scrub to happen on **pgid**.  It
+# will fail if scrub does not complete within $TIMEOUT seconds. The
+# scrub is complete whenever the **get_last_scrub_stamp** function
+# reports a timestamp different from the one given in argument.
+#
+# @param pgid the id of the PG
+# @param last_scrub timestamp of the last scrub for *pgid*
+# @return 0 on success, 1 on error
+#
+function wait_for_scrub() {
+    local pgid=$1
+    local last_scrub="$2"
+    local i
+    for ((i=0; i < $TIMEOUT; i++)); do
+        if test "$last_scrub" != "$(get_last_scrub_stamp $pgid)" ; then
+            return 0
+        fi
+        sleep 1
+    done
+    return 1
+}
+
+function test_wait_for_scrub() {
+    local dir=$1
+
+    setup $dir || return 1
+    run_mon $dir a --osd_pool_default_size=1 || return 1
+    run_osd $dir 0 || return 1
+    wait_for_clean || return 1
+    local pgid=1.0
+    local last_scrub=$(get_last_scrub_stamp $pgid) # read before repair: avoids racing a fast repair
+    ceph pg repair $pgid
+    wait_for_scrub $pgid "$last_scrub" || return 1
+    kill_daemons $dir KILL osd || return 1
+    last_scrub=$(get_last_scrub_stamp $pgid)
+    ! TIMEOUT=1 wait_for_scrub $pgid "$last_scrub" || return 1 # osd is down: the wait must time out
+    teardown $dir || return 1
+}
+
+#######################################################################
+
  ##
  # Return 0 if the erasure code *plugin* is available, 1 otherwise.
  #
diff --git a/src/test/osd/osd-scrub-repair.sh b/src/test/osd/osd-scrub-repair.sh

index ca329975ac4187015ac2d0d23c2b968b309c2ea1..2b22ab906b422022b9deedab9146905138c88317 100755 (executable)
--- a/src/test/osd/osd-scrub-repair.sh
+++ b/src/test/osd/osd-scrub-repair.sh
@@ -159,25 +159,24 @@ function TEST_auto_repair_erasure_coded() {
              --osd-scrub-min-interval=5 \
              --osd-scrub-interval-randomize-ratio=0
      done
-    wait_for_clean || return 1
  
      # Create an EC pool
      ceph osd erasure-code-profile set myprofile \
          k=2 m=1 ruleset-failure-domain=osd || return 1
      ceph osd pool create $poolname 8 8 erasure myprofile || return 1
-    wait_for_clean || return 1
  
      # Put an object
      local payload=ABCDEF
      echo $payload > $dir/ORIGINAL
      rados --pool $poolname put SOMETHING $dir/ORIGINAL || return 1
+    wait_for_clean || return 1
  
      # Remove the object from one shard physically
      objectstore_tool $dir $(get_not_primary $poolname SOMETHING) SOMETHING remove || return 1
-
-    # Give some time for auto repair
-    sleep 20
-
+    # Wait for auto repair
+    local pgid=$(get_pg $poolname SOMETHING)
+    wait_for_scrub $pgid "$(get_last_scrub_stamp $pgid)"
+    wait_for_clean || return 1
      # Verify - the file should be back
      objectstore_tool $dir $(get_not_primary $poolname SOMETHING) SOMETHING list-attrs || return 1
      rados --pool $poolname get SOMETHING $dir/COPY || return 1
author	Loic Dachary <ldachary@redhat.com>
	Sat, 28 Nov 2015 16:55:46 +0000 (17:55 +0100)
committer	Loic Dachary <ldachary@redhat.com>
	Sun, 29 Nov 2015 11:17:16 +0000 (12:17 +0100)
qa/workunits/ceph-helpers.sh		patch \| blob \| history
src/test/osd/osd-scrub-repair.sh		patch \| blob \| history