From: Loic Dachary Date: Mon, 21 Nov 2016 09:32:17 +0000 (+0100) Subject: tests: fix ceph-helpers.sh wait_for_clean delays X-Git-Tag: v11.1.0~229^2 X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=refs%2Fpull%2F12095%2Fhead;p=ceph.git tests: fix ceph-helpers.sh wait_for_clean delays The TENTH_TIMEOUT was not delcared as an int and failed to be set with the correct number. The test of the function did not catch this. Implement computing of the increasingly large sleep delays in a separate function so that it can be tested more easily. Give up on sub-second sleep because a the function will not sleep at all if the cluster is already clean. And if it is not already clean, it is very unlikely to become clean within less than a second. The downside of having very short sleep time is that it needlessly stress the machine and also possibly spam the logs. Refs: http://tracker.ceph.com/issues/17830 Signed-off-by: Loic Dachary --- diff --git a/qa/workunits/ceph-helpers.sh b/qa/workunits/ceph-helpers.sh index d309001dd802..01c8de166b70 100755 --- a/qa/workunits/ceph-helpers.sh +++ b/qa/workunits/ceph-helpers.sh @@ -1071,6 +1071,41 @@ function test_is_clean() { ####################################################################### +## +# Return a list of numbers that are increasingly larger and whose +# total is **timeout** seconds. It can be used to have short sleep +# delay while waiting for an event on a fast machine. But if running +# very slowly the larger delays avoid stressing the machine even +# further or spamming the logs. +# +# @param timeout sum of all delays, in seconds +# @return a list of sleep delays +# +function get_timeout_delays() { + local -i timeout=$1 + local -i first_step=1 + + local -i i + local -i total=0 + for (( i = $first_step ; $total + $i <= $timeout ; i *= 2 )) ; do + echo -n "$i " + total+=$i + done + if (( $total < $timeout )) ; then + echo -n $(( $timeout - $total )) + fi +} + +function test_get_timeout_delays() { + test "$(get_timeout_delays 1)" = "1 " || return 1 + test "$(get_timeout_delays 5)" = "1 2 2" || return 1 + test "$(get_timeout_delays 6)" = "1 2 3" || return 1 + test "$(get_timeout_delays 7)" = "1 2 4 " || return 1 + test "$(get_timeout_delays 8)" = "1 2 4 1" || return 1 +} + +####################################################################### + ## # Wait until the cluster becomes clean or if it does not make progress # for $TIMEOUT seconds. @@ -1080,18 +1115,12 @@ function test_is_clean() { # @return 0 if the cluster is clean, 1 otherwise # function wait_for_clean() { - local status=1 local num_active_clean=-1 local cur_active_clean + local -a delays=($(get_timeout_delays $TIMEOUT)) local -i timer=0 - local -i loops=0 - local -i phase=0 local num_pgs=$(get_num_pgs) test $num_pgs != 0 || return 1 - local TENTH_TIMEOUT=($TIMEOUT * 10) - # The first phase 10 times (1 second) second phase an additional 15 times (15 seconds) - local backoff_phases=( 10 15 -1 ) - local sleep_backoff=( .1 1 10) while true ; do # Comparing get_num_active_clean & get_num_pgs is used to determine @@ -1100,21 +1129,16 @@ function wait_for_clean() { cur_active_clean=$(get_num_active_clean) test $cur_active_clean = $num_pgs && break if test $cur_active_clean != $num_active_clean ; then - timer=0 ; loops=0 ; phase=0 + timer=0 num_active_clean=$cur_active_clean elif get_is_making_recovery_progress ; then - timer=0 ; loops=0 ; phase=0 - elif (( timer >= $TENTH_TIMEOUT)) ; then + timer=0 + elif (( $timer >= ${#delays[*]} )) ; then ceph report return 1 fi - sleep ${sleep_backoff[$phase]} - timer="$(echo $timer + ${sleep_backoff[$phase]} \* 10 | bc | cut -d. -f1)" - loops=$(expr $loops + 1) - if (( $loops == ${backoff_phases[$phase]} )); then - phase=$(expr $phase + 1) - loop=0 - fi + sleep ${delays[$timer]} + timer+=1 done return 0 }