From: Kamoltat Sirivadhna Date: Thu, 26 Jun 2025 19:58:03 +0000 (+0000) Subject: qa/workunits/rados/test.sh: add timeout mechanism to RADOS parallel tests X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=ec3de7522023729d496eb9c14a259c62f6350d99;p=ceph.git qa/workunits/rados/test.sh: add timeout mechanism to RADOS parallel tests This commit adds a timeout (30 minutes by default, configurable with --timeout) for each test in the RADOS test suite when running in parallel mode. Previously, if a test hung, the entire test script would hang indefinitely, causing teuthology jobs to time out without useful diagnostic information. The implementation: adds a timeout check in the process monitoring loop; forcibly terminates tests that run longer than the configured timeout; ensures wait is only called after a process has ended or been killed; and improves error reporting to identify which specific test caused the issue. This change prevents indefinite hanging of the test script and provides better visibility into which test is problematic when timeouts occur. 
Fixes: https://tracker.ceph.com/issues/70772 Signed-off-by: Kamoltat Sirivadhna --- diff --git a/qa/workunits/rados/test.sh b/qa/workunits/rados/test.sh index 5256bd82d06..bf9fb3baa76 100755 --- a/qa/workunits/rados/test.sh +++ b/qa/workunits/rados/test.sh @@ -1,14 +1,39 @@ #!/usr/bin/env bash set -ex -parallel=1 -[ "$1" = "--serial" ] && parallel=0 +# ./test.sh # Default: parallel mode, 30-min timeout +# ./test.sh --serial # Serial mode, 30-min timeout +# ./test.sh --crimson # Crimson mode, 30-min timeout +# ./test.sh --timeout 3600 # Parallel mode, 60-min timeout +# ./test.sh --serial --timeout 60 # Serial mode, 1-min timeout +# ./test.sh --crimson --timeout 0 # Crimson mode, no timeout -# let crimson run in serial mode +# First argument must be either --serial or --crimson or nothing +parallel=1 crimson=0 -[ "$1" = "--crimson" ] && parallel=0 && crimson=1 +if [ "$1" = "--serial" ]; then + parallel=0 + shift # Remove the first argument from the list so timeout can be processed next +elif [ "$1" = "--crimson" ]; then + parallel=0 + crimson=1 + shift +fi + +# After processing the first arg, check for --timeout +timeout=1800 # 30 minutes default value +if [ "$1" = "--timeout" ]; then + shift + if [ -n "$1" ] && [[ "$1" =~ ^[0-9]+$ ]]; then + timeout=$1 + shift # Remove the timeout value from the list so color can be processed next + else + echo "Invalid or missing timeout value after --timeout. Must be a number." + exit 1 + fi +fi -color="" +color="" # Default color setting for gtest in terminal (-t) [ -t 1 ] && color="--gtest_color=yes" function cleanup() { @@ -78,12 +103,35 @@ ret=0 if [ $parallel -eq 1 ]; then for t in "${!pids[@]}" do - pid=${pids[$t]} - if ! wait $pid - then - echo "error in $t ($pid)" - ret=1 - fi + # Set timeout values + max_wait=$timeout + waited=0 + check_interval=10 + pid=${pids[$t]} + echo "Checking Test $t (PID $pid)..." 
+ # Check in a loop with timeout + # kill -0 checks if the process is running + # and 2>/dev/null suppresses error messages if the process is not found + while kill -0 $pid 2>/dev/null; do + sleep $check_interval + waited=$((waited + check_interval)) + echo "Waiting for test $t (PID $pid)... waited $waited seconds" + if [ $waited -ge $max_wait ]; then + # Process timed out + echo "ERROR: Test $t ($pid) - TIMED OUT after $max_wait seconds" + kill -9 $pid 2>/dev/null || true + ret=1 + break + fi + done + # Only wait after process has ended naturally or been killed + # We only call wait after determining that the process is no longer running + # So this won't hang indefinitely like https://tracker.ceph.com/issues/70772 + wait $pid 2>/dev/null || { + echo "ERROR: Test $t (PID $pid) failed with non-zero exit status" + echo "Check the logs for failures in $t" + ret=1 + } done fi