From 94c958d2c9570e55305384ac86185e328746d2ff Mon Sep 17 00:00:00 2001 From: David Zafman Date: Thu, 8 Dec 2016 23:00:13 -0800 Subject: [PATCH] test: Add test for keeping deep-scrub information Signed-off-by: David Zafman (cherry picked from commit 64a7012e986ec88994c073b738fd08e8958395c3) --- qa/workunits/ceph-helpers.sh | 4 +- src/test/osd/osd-scrub-repair.sh | 103 +++++++++++++++++++++++++++++-- 2 files changed, 100 insertions(+), 7 deletions(-) diff --git a/qa/workunits/ceph-helpers.sh b/qa/workunits/ceph-helpers.sh index 3178cd58c71c1..e62faea635ad0 100755 --- a/qa/workunits/ceph-helpers.sh +++ b/qa/workunits/ceph-helpers.sh @@ -897,6 +897,8 @@ function test_get_not_primary() { # @param STDOUT the output of ceph-objectstore-tool # @return 0 on success, 1 on error # +# The value of $ceph_osd_args will be passed to restarted osds +# function objectstore_tool() { local dir=$1 shift @@ -909,7 +911,7 @@ function objectstore_tool() { --data-path $osd_data \ --journal-path $osd_data/journal \ "$@" || return 1 - activate_osd $dir $id >&2 || return 1 + activate_osd $dir $id $ceph_osd_args >&2 || return 1 wait_for_clean >&2 } diff --git a/src/test/osd/osd-scrub-repair.sh b/src/test/osd/osd-scrub-repair.sh index 2be0747c883db..cf06c5139472f 100755 --- a/src/test/osd/osd-scrub-repair.sh +++ b/src/test/osd/osd-scrub-repair.sh @@ -49,9 +49,16 @@ function add_something() { local dir=$1 local poolname=$2 local obj=${3:-SOMETHING} + local scrub=${4:-noscrub} - ceph osd set noscrub || return 1 - ceph osd set nodeep-scrub || return 1 + if [ "$scrub" = "noscrub" ]; + then + ceph osd set noscrub || return 1 + ceph osd set nodeep-scrub || return 1 + else + ceph osd unset noscrub || return 1 + ceph osd unset nodeep-scrub || return 1 + fi local payload=ABCDEF echo $payload > $dir/ORIGINAL @@ -173,13 +180,13 @@ function TEST_auto_repair_erasure_coded() { # Launch a cluster with 5 seconds scrub interval setup $dir || return 1 run_mon $dir a || return 1 - for id in $(seq 0 2) ; do - run_osd $dir $id \ - --osd-scrub-auto-repair=true \ + local ceph_osd_args="--osd-scrub-auto-repair=true \ --osd-deep-scrub-interval=5 \ --osd-scrub-max-interval=5 \ --osd-scrub-min-interval=5 \ - --osd-scrub-interval-randomize-ratio=0 + --osd-scrub-interval-randomize-ratio=0" + for id in $(seq 0 2) ; do + run_osd $dir $id $ceph_osd_args done wait_for_clean || return 1 @@ -195,12 +202,14 @@ function TEST_auto_repair_erasure_coded() { rados --pool $poolname put SOMETHING $dir/ORIGINAL || return 1 # Remove the object from one shard physically + # Restarted osd get $ceph_osd_args passed objectstore_tool $dir $(get_not_primary $poolname SOMETHING) SOMETHING remove || return 1 # Wait for auto repair local pgid=$(get_pg $poolname SOMETHING) wait_for_scrub $pgid "$(get_last_scrub_stamp $pgid)" wait_for_clean || return 1 # Verify - the file should be back + # Restarted osd get $ceph_osd_args passed objectstore_tool $dir $(get_not_primary $poolname SOMETHING) SOMETHING list-attrs || return 1 rados --pool $poolname get SOMETHING $dir/COPY || return 1 diff $dir/ORIGINAL $dir/COPY || return 1 @@ -2141,6 +2150,88 @@ EOF teardown $dir || return 1 } +# +# Test to make sure that a periodic scrub won't cause deep-scrub info to be lost +# +function TEST_periodic_scrub_replicated() { + local dir=$1 + local poolname=psr_pool + local objname=POBJ + + setup $dir || return 1 + run_mon $dir a --osd_pool_default_size=2 || return 1 + local ceph_osd_args="--osd-scrub-interval-randomize-ratio=0 --osd-deep-scrub-randomize-ratio=0" + run_osd $dir 0 $ceph_osd_args || return 1 + run_osd $dir 1 $ceph_osd_args || return 1 + wait_for_clean || return 1 + + ceph osd pool create $poolname 1 1 || return 1 + wait_for_clean || return 1 + + local osd=0 + add_something $dir $poolname $objname scrub || return 1 + local primary=$(get_primary $poolname $objname) + local pg=$(get_pg $poolname $objname) + + # Add deep-scrub only error + local payload=UVWXYZ + echo $payload > $dir/CORRUPT + # Uses $ceph_osd_args for osd restart + objectstore_tool $dir $osd $objname set-bytes $dir/CORRUPT || return 1 + + # No scrub information available, so expect failure + set -o pipefail + ! rados list-inconsistent-obj $pg | jq '.' || return 1 + set +o pipefail + + pg_deep_scrub $pg || return 1 + + # Make sure bad object found + rados list-inconsistent-obj $pg | jq '.' | grep -q $objname || return 1 + + local last_scrub=$(get_last_scrub_stamp $pg) + # Fake a schedule scrub + CEPH_ARGS='' ceph --admin-daemon $dir/ceph-osd.${primary}.asok \ + trigger_scrub $pg || return 1 + # Wait for schedule regular scrub + wait_for_scrub $pg "$last_scrub" + + # It needed to be upgraded + grep -q "Deep scrub errors, upgrading scrub to deep-scrub" $dir/osd.${primary}.log || return 1 + + # Bad object still known + rados list-inconsistent-obj $pg | jq '.' | grep -q $objname || return 1 + + # Can't upgrade with this set + ceph osd set nodeep-scrub + + # Fake a schedule scrub + local last_scrub=$(get_last_scrub_stamp $pg) + CEPH_ARGS='' ceph --admin-daemon $dir/ceph-osd.${primary}.asok \ + trigger_scrub $pg || return 1 + # Wait for schedule regular scrub + # to notice scrub and skip it + local found=false + for i in $(seq 14 -1 0) + do + sleep 1 + ! grep -q "Regular scrub skipped due to deep-scrub errors and nodeep-scrub set" $dir/osd.${primary}.log || { found=true ; break; } + echo Time left: $i seconds + done + test $found = "true" || return 1 + + # Bad object still known + rados list-inconsistent-obj $pg | jq '.' | grep -q $objname || return 1 + + # Request a regular scrub and it will be done + pg_scrub $pg + grep -q "Regular scrub request, losing deep-scrub details" $dir/osd.${primary}.log || return 1 + + # deep-scrub error is no longer present + rados list-inconsistent-obj $pg | jq '.' | grep -qv $objname || return 1 +} + + main osd-scrub-repair "$@" # Local Variables: -- 2.39.5