From: Kamoltat Sirivadhna <ksirivad@redhat.com>
Date: Wed, 2 Jul 2025 20:20:39 +0000 (+0000)
Subject: qa/workunits/rados/test.sh: make --serial work with --timeout
X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=bfa6edb0ea2827919d77f94d59ccee0425eb510d;p=ceph.git

qa/workunits/rados/test.sh: make --serial work with --timeout

- For --serial we use timeout function and --timeout
value to to reutrn exit 1 if test exceeds --timeout.
- Update documentation for command-line arguments
such that user understands the order of argument matters.
- Use test arrays for better organization of test definitions
- Add test type tracking to properly handle both rados
and neorados tests when falling back on XML
- Change timeout default value to 90 minutes

Fixes: https://tracker.ceph.com/issues/70772

Signed-off-by: Kamoltat Sirivadhna <ksirivad@redhat.com>
---

diff --git a/qa/workunits/rados/test.sh b/qa/workunits/rados/test.sh
index 6593adb0e4d..d954a24b066 100755
--- a/qa/workunits/rados/test.sh
+++ b/qa/workunits/rados/test.sh
@@ -1,12 +1,47 @@
 #!/usr/bin/env bash
 set -ex
-
-# ./test.sh                               # Default: parallel mode, 30-min timeout
-# ./test.sh --serial                      # Serial mode, 30-min timeout
-# ./test.sh --crimson                     # Crimson mode, 30-min timeout
-# ./test.sh --timeout 3600                # Parallel mode, 60-min timeout
-# ./test.sh --serial --timeout 60         # Serial mode, 1-min timeout
-# ./test.sh --crimson --timeout 0         # Crimson mode, no timeout
+# This script runs the RADOS API tests in parallel or serial mode, with optional --timeout for each test.
+# It can also be run in a vstart environment for local testing.
+
+# Define test arrays for better organization
+RADOS_TESTS=(
+    api_aio api_aio_pp
+    api_io api_io_pp
+    api_asio api_list
+    api_lock api_lock_pp
+    api_misc api_misc_pp
+    api_tier_pp
+    api_pool
+    api_snapshots api_snapshots_pp
+    api_stat api_stat_pp
+    api_watch_notify api_watch_notify_pp
+    api_cmd api_cmd_pp
+    api_service api_service_pp
+    api_c_write_operations
+    api_c_read_operations
+    list_parallel
+    open_pools_parallel
+    delete_pools_parallel
+)
+
+NEORADOS_TESTS=(
+    cls cmd handler_error io ec_io list ec_list misc pool
+    read_operations snapshots watch_notify write_operations
+)
+
+# Note on argument ordering: This script processes arguments sequentially,
+# so the order matters. Arguments must be provided in this specific sequence:
+# 1. --serial OR --crimson (optional)
+# 2. --timeout VALUE (optional) (for each test)
+# 3. --vstart (optional)
+#
+# For example:
+# ./test.sh                               # Default: parallel mode, 90-min timeout for each test
+# ./test.sh --serial                      # Serial mode, 90-min timeout for each test
+# ./test.sh --crimson                     # Crimson mode, 90-min timeout for each test
+# ./test.sh --timeout 3600                # Parallel mode, 60-min timeout for each test
+# ./test.sh --serial --timeout 60         # Serial mode, 1-min timeout for each test
+# ./test.sh --crimson --timeout 0         # Crimson mode, no timeout for each test
 # ../qa/workunits/rados/test.sh --vstart  # Run tests locally from `ceph/build` dir
 
 # First argument must be either --serial or --crimson or nothing
@@ -22,10 +57,11 @@ elif [ "$1" = "--crimson" ]; then
 fi
 
 # After processing the first arg, check for --timeout
-timeout=1800  # 30 minutes default value
+timeout=5400  # 90 minutes default value
 if [ "$1" = "--timeout" ]; then
     shift
     if [ -n "$1" ] && [[ "$1" =~ ^[0-9]+$ ]]; then
+        echo "Setting timeout to $1 seconds for each test"
         timeout=$1
         shift # Remove the timeout value from the list so color can be processed next
     else
@@ -38,7 +74,10 @@ color="" # Default color setting for gtest in terminal (-t)
 [ -t 1 ] && color="--gtest_color=yes"
 
 vstart=0
-[ "$1" = "--vstart" ] && vstart=1
+if [ "$1" = "--vstart" ]; then
+    vstart=1
+    shift
+fi
 
 function cleanup() {
     pkill -P $$ || true
@@ -49,35 +88,17 @@ GTEST_OUTPUT_DIR=${TESTDIR:-$(mktemp -d)}/archive/unit_test_xml_report
 mkdir -p $GTEST_OUTPUT_DIR
 
 declare -A pids
+declare -A test_type
+ret=0
 
-
-# If in vstart mode, compile all test targets
+# If in vstart mode, compile all test targets and start a vstart cluster.
 if [ $vstart -eq 1 ]; then
-    for f in \
-        api_aio api_aio_pp \
-	api_io api_io_pp \
-	api_asio api_list \
-	api_lock api_lock_pp \
-	api_misc api_misc_pp \
-	api_tier_pp \
-	api_pool \
-	api_snapshots api_snapshots_pp \
-	api_stat api_stat_pp \
-	api_watch_notify api_watch_notify_pp \
-	api_cmd api_cmd_pp \
-	api_service api_service_pp \
-	api_c_write_operations \
-	api_c_read_operations \
-	list_parallel \
-	open_pools_parallel \
-	delete_pools_parallel
+    for f in "${RADOS_TESTS[@]}";
     do
         ninja -j$(nproc) ceph_test_rados_$f
     done
 
-    for f in \
-        cls cmd handler_error io ec_io list ec_list misc pool read_operations snapshots \
-        watch_notify write_operations
+    for f in "${NEORADOS_TESTS[@]}";
     do
         ninja -j$(nproc) ceph_test_neorados_$f
     done
@@ -87,42 +108,32 @@ if [ $vstart -eq 1 ]; then
     ../src/vstart.sh --debug --new -x --localhost --bluestore
 fi
 
-# Running all tests
-for f in \
-    api_aio api_aio_pp \
-    api_io api_io_pp \
-    api_asio api_list \
-    api_lock api_lock_pp \
-    api_misc api_misc_pp \
-    api_tier_pp \
-    api_pool \
-    api_snapshots api_snapshots_pp \
-    api_stat api_stat_pp \
-    api_watch_notify api_watch_notify_pp \
-    api_cmd api_cmd_pp \
-    api_service api_service_pp \
-    api_c_write_operations \
-    api_c_read_operations \
-    list_parallel \
-    open_pools_parallel \
-    delete_pools_parallel
+# Running all tests in ceph_test_rados
+for f in "${RADOS_TESTS[@]}"
 do
     executable="ceph_test_rados_$f"
     if [ $vstart -eq 1 ]; then
         executable="./bin/$executable"
     fi
     if [ $parallel -eq 1 ]; then
-	r=`printf '%25s' $f`
-	ff=`echo $f | awk '{print $1}'`
-	bash -o pipefail -exc "$executable --gtest_output=xml:$GTEST_OUTPUT_DIR/$f.xml $color 2>&1 | tee ceph_test_rados_$ff.log | sed \"s/^/$r: /\"" &
-	pid=$!
-	echo "test $f on pid $pid"
-	pids[$f]=$pid
+        r=`printf '%25s' $f`
+        ff=`echo $f | awk '{print $1}'`
+        bash -o pipefail -exc "$executable --gtest_output=xml:$GTEST_OUTPUT_DIR/$f.xml $color 2>&1 | tee ceph_test_rados_$ff.log | sed \"s/^/$r: /\"" &
+        pid=$!
+        echo "test $f on pid $pid"
+	    pids[$f]=$pid
+        test_type["$f"]="rados" # Store test type for later use in parallel mode
     else
-	$executable
+        # If running in serial mode, run the test directly
+        if ! timeout $timeout $executable; then
+            echo "ERROR: Test $f timed out after $timeout seconds"
+            echo "Check the logs for failures in $f"
+            ret=1
+        fi
     fi
 done
 
+# Running all tests in ceph_test_neorados
 for f in \
     cls cmd handler_error io ec_io list ec_list misc pool read_operations snapshots \
     watch_notify write_operations
@@ -132,28 +143,33 @@ do
         executable="./bin/$executable"
     fi
     if [ $parallel -eq 1 ]; then
-	r=`printf '%25s' $f`
-	ff=`echo $f | awk '{print $1}'`
-	bash -o pipefail -exc "$executable --gtest_output=xml:$GTEST_OUTPUT_DIR/neorados_$f.xml $color 2>&1 | tee ceph_test_neorados_$ff.log | sed \"s/^/$r: /\"" &
-	pid=$!
-	echo "test $f on pid $pid"
-	pids[$f]=$pid
+        r=`printf '%25s' $f`
+        ff=`echo $f | awk '{print $1}'`
+        bash -o pipefail -exc "$executable --gtest_output=xml:$GTEST_OUTPUT_DIR/neorados_$f.xml $color 2>&1 | tee ceph_test_neorados_$ff.log | sed \"s/^/$r: /\"" &
+        pid=$!
+        echo "test $f on pid $pid"
+        pids[$f]=$pid
+        test_type["$f"]="neorados" # Store test type for later use in parallel mode
     else
-	if [ $crimson -eq 1 ]; then
-		if [ $f = "ec_io" ] || [ $f = "ec_list" ]; then
-			echo "Skipping EC with Crimson"
-			continue
-		fi
-	fi
-	$executable
+        if [ $crimson -eq 1 ]; then
+            if [ $f = "ec_io" ] || [ $f = "ec_list" ]; then
+                echo "Skipping EC with Crimson"
+                continue
+            fi
+        fi
+        # If running in serial mode, run the test directly
+        if ! timeout $timeout $executable; then
+            echo "ERROR: Test $f timed out after $timeout seconds"
+            echo "Check the logs for failures in $f"
+            ret=1
+        fi
     fi
 done
 
-ret=0
 if [ $parallel -eq 1 ]; then
 for t in "${!pids[@]}"
 do
-    # Set timeout values
+    # Set timeout value for each test
     max_wait=$timeout
     waited=0
     check_interval=10
@@ -167,17 +183,16 @@ do
         waited=$((waited + check_interval))
         echo "Waiting for test $t (PID $pid)... waited $waited seconds"
         if [ $waited -ge $max_wait ]; then
-        # Process timed out
-        echo "ERROR: Test $t ($pid) - TIMED OUT after $max_wait seconds"
-
-        # Create fallback XML file
-        xml_path="$GTEST_OUTPUT_DIR/$t.xml"
-        if [[ $t == neorados_* ]]; then
-            xml_path="$GTEST_OUTPUT_DIR/neorados_$t.xml"
-        fi
-
-        echo "Creating fallback XML report at $xml_path"
-        cat > "$xml_path" << EOF
+            # Process timed out
+            echo "ERROR: Test $t ($pid) - TIMED OUT after $max_wait seconds"
+            # Create fallback XML file
+            if [ "${test_type[$t]}" = "neorados" ]; then
+                xml_path="$GTEST_OUTPUT_DIR/neorados_$t.xml"
+            else
+                xml_path="$GTEST_OUTPUT_DIR/$t.xml"
+            fi
+            echo "Creating fallback XML report at $xml_path"
+            cat > "$xml_path" << EOF
 <?xml version="1.0" encoding="UTF-8"?>
 <testsuites tests="1" failures="1" disabled="0" errors="0" time="${max_wait}.000" timestamp="$(date -Iseconds)" name="AllTests">
   <testsuite name="Timeout" tests="1" failures="1" disabled="0" errors="0" time="${max_wait}.000">
@@ -191,9 +206,9 @@ do
   </testsuite>
 </testsuites>
 EOF
-        kill -9 $pid 2>/dev/null || true
-        ret=1
-        break
+            kill -9 $pid 2>/dev/null || true
+            ret=1
+            break
         fi
     done
     # Only wait after process has ended naturally or been killed