From a269bb71885f239cb11a2620e11e527d7fff6b38 Mon Sep 17 00:00:00 2001 From: =?utf8?q?Piotr=20Da=C5=82ek?= Date: Fri, 14 Oct 2016 04:42:18 +0200 Subject: [PATCH] test/osd/osd-fast-mark-down.sh: introduce large timeout MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit One second might not be enough for a loaded system to fully process the fast mark down cycle, so introduce a loop that checks for the OSD to be marked down within 30 seconds. That limit can later be extended (or shortened) as necessary. Fixes: http://tracker.ceph.com/issues/17918 Signed-off-by: Piotr Dałek --- src/test/osd/osd-fast-mark-down.sh | 36 ++++++++++++++++++++++-------- 1 file changed, 27 insertions(+), 9 deletions(-) diff --git a/src/test/osd/osd-fast-mark-down.sh b/src/test/osd/osd-fast-mark-down.sh index 77fcc861f98..57b9a28c1d1 100755 --- a/src/test/osd/osd-fast-mark-down.sh +++ b/src/test/osd/osd-fast-mark-down.sh @@ -18,6 +18,7 @@ source $(dirname $0)/../detect-build-env-vars.sh source $CEPH_ROOT/qa/workunits/ceph-helpers.sh +MAX_PROPAGATION_TIME=30 function run() { local dir=$1 @@ -62,7 +63,7 @@ function test_fast_kill() { killid=0 previd=0 - # kill random osd and see if 1 sec after, the osd count decreased. + # kill random osd and see if after max MAX_PROPAGATION_TIME, the osd count decreased. for i in {1..2}; do while [ $killid -eq $previd ]; do killid=${pids[$RANDOM%${#pids[@]}]} @@ -70,20 +71,37 @@ function test_fast_kill() { previd=$killid kill -9 $killid - sleep 1 + time_left=$MAX_PROPAGATION_TIME + down_osds=0 + + while [ $time_left -gt 0 ]; do + sleep 1 + time_left=$[$time_left - 1]; + + grep -m 1 -c -F "ms_handle_refused" $dir/osd.*.log > /dev/null + if [ $? -ne 0 ]; then + continue + fi + + down_osds=$(ceph osd tree | grep -c down) + if [ $down_osds -lt $i ]; then + # osds not marked down yet, try again in a second + continue + elif [ $down_osds -gt $i ]; then + echo Too many \($down_osds\) osds died! 
+ teardown $dir + return 1 + else + break + fi + done - down_osds=$(ceph osd tree | grep -c down) if [ $down_osds -lt $i ]; then echo Killed the OSD, yet it is not marked down ceph osd tree - teardown $dir - return 1 - elif [ $down_osds -gt $i ]; then - echo Too many \($down_osds\) osds died! - teardown $dir + teardown $dir return 1 fi - done pkill -SIGTERM rados teardown $dir || return 1 -- 2.39.5