From d2584d6c290d2426c8ecf8f4a274dca3eab40f8a Mon Sep 17 00:00:00 2001 From: Vallari Agrawal Date: Fri, 27 Jun 2025 15:18:02 +0530 Subject: [PATCH] qa: Improve scalability test Improve logs of scalability script. Add DEBUG mode in comments; we can use it when needed. Signed-off-by: Vallari Agrawal --- .../basic/workloads/nvmeof_scalability.yaml | 12 ++++++ .../10-subsys-90-namespace-no_huge_pages.yaml | 2 +- qa/workunits/nvmeof/basic_tests.sh | 2 +- qa/workunits/nvmeof/fio_test.sh | 2 +- qa/workunits/nvmeof/scalability_test.sh | 37 +++++++++++++------ 5 files changed, 41 insertions(+), 14 deletions(-) diff --git a/qa/suites/nvmeof/basic/workloads/nvmeof_scalability.yaml b/qa/suites/nvmeof/basic/workloads/nvmeof_scalability.yaml index 236d62dc940..efbd66d8125 100644 --- a/qa/suites/nvmeof/basic/workloads/nvmeof_scalability.yaml +++ b/qa/suites/nvmeof/basic/workloads/nvmeof_scalability.yaml @@ -13,6 +13,18 @@ tasks: - cephadm.wait_for_service: service: nvmeof.mypool.mygroup0 +## Enable gateway in DEBUG mode +# - cephadm.exec: +# host.a: +# - ceph orch ls nvmeof --export > /tmp/nvmeof-orig.yaml +# - "sed 's/log_level:\ INFO/log_level:\ DEBUG/g' /tmp/nvmeof-orig.yaml > /tmp/nvmeof-debug.yaml" +# - cat /tmp/nvmeof-debug.yaml +# - ceph orch apply -i /tmp/nvmeof-debug.yaml +# - ceph orch redeploy nvmeof.mypool.mygroup0 + +# - cephadm.wait_for_service: +# service: nvmeof.mypool.mygroup0 + - workunit: no_coverage_and_limits: true timeout: 30m diff --git a/qa/suites/nvmeof/thrash/gateway-initiator-setup/10-subsys-90-namespace-no_huge_pages.yaml b/qa/suites/nvmeof/thrash/gateway-initiator-setup/10-subsys-90-namespace-no_huge_pages.yaml index d2da1d0877a..30f7e583403 100644 --- a/qa/suites/nvmeof/thrash/gateway-initiator-setup/10-subsys-90-namespace-no_huge_pages.yaml +++ b/qa/suites/nvmeof/thrash/gateway-initiator-setup/10-subsys-90-namespace-no_huge_pages.yaml @@ -19,7 +19,7 @@ tasks: - cp /tmp/nvmeof-orig.yaml /tmp/nvmeof-no-huge-page.yaml - "sed -i '/ pool: mypool/a\\ 
spdk_mem_size: 4096' /tmp/nvmeof-no-huge-page.yaml" - cat /tmp/nvmeof-no-huge-page.yaml - - ceph orch ls --refresh + - ceph orch ls - ceph orch apply -i /tmp/nvmeof-no-huge-page.yaml - ceph orch redeploy nvmeof.mypool.mygroup0 diff --git a/qa/workunits/nvmeof/basic_tests.sh b/qa/workunits/nvmeof/basic_tests.sh index b7483787f95..e4670b8baea 100755 --- a/qa/workunits/nvmeof/basic_tests.sh +++ b/qa/workunits/nvmeof/basic_tests.sh @@ -65,7 +65,7 @@ test_run() { echo "[nvmeof] $1 test failed!" sudo nvme list-subsys sudo nvme list - sudo dmesg -T > $TESTDIR/archive/dmesg.log + sudo dmesg -T > $TESTDIR/archive/dmesg-basic_tests.log exit 1 fi } diff --git a/qa/workunits/nvmeof/fio_test.sh b/qa/workunits/nvmeof/fio_test.sh index 7aa26d973b6..3066f5c863d 100755 --- a/qa/workunits/nvmeof/fio_test.sh +++ b/qa/workunits/nvmeof/fio_test.sh @@ -94,7 +94,7 @@ status_log() { sudo nvme list-subsys /dev/$device sudo nvme id-ns /dev/$device done - + sudo dmesg -T > $TESTDIR/archive/dmesg-fio_tests.log } diff --git a/qa/workunits/nvmeof/scalability_test.sh b/qa/workunits/nvmeof/scalability_test.sh index d83cbcdd0e4..85e88032244 100755 --- a/qa/workunits/nvmeof/scalability_test.sh +++ b/qa/workunits/nvmeof/scalability_test.sh @@ -8,12 +8,24 @@ GROUP="${NVMEOF_GROUP:-mygroup0}" source /etc/ceph/nvmeof.env if [ -z "$GATEWAYS" ]; then - echo "At least one gateway needs to be defined for scalability test" + echo "[nvmeof.scale] At least one gateway needs to be defined for scalability test" exit 1 fi status_checks() { + status_checks_ $1 + if [ $? -eq 0 ]; then + echo "[nvmeof.scale] Verified successfully that everything is working with $1 gateways" + else + echo "[nvmeof.scale] Verification failed!" 
+ sudo dmesg -T > $TESTDIR/archive/dmesg-scalability_test.log + exit 1 + fi +} + +status_checks_() { expected_count=$1 + echo "[nvmeof.scale] Verifying that everything is working with $expected_count gateways" output=$(ceph nvme-gw show $POOL $GROUP) # nvme_show=$(echo $output | grep -o '"AVAILABLE"' | wc -l) @@ -40,25 +52,27 @@ status_checks() { num_namespaces=$(echo "$gw" | jq '.["num-namespaces"]') if [[ "$availability" != "AVAILABLE" ]]; then - echo "Gateway $gw_id is not AVAILABLE." - exit 1 + echo "[nvmeof.scale] Gateway $gw_id is not AVAILABLE." + return 1 fi diff=$((num_namespaces - expected_avg_ns)) if [[ $diff -lt -1 || $diff -gt 1 ]]; then - echo "Gateway $gw_id has num-namespaces ($num_namespaces), expected around $expected_ns_count. Indicates a problem in ns load-balancing." - exit 1 + echo "[nvmeof.scale] Gateway $gw_id has num-namespaces ($num_namespaces), expected around $expected_avg_ns. Indicates a problem in ns load-balancing." + return 1 fi done orch_ls=$(ceph orch ls) if ! 
echo "$orch_ls" | grep -q "$expected_count/$expected_count"; then + echo "[nvmeof.scale] Expected $expected_count running gateways in 'ceph orch ls'" return 1 fi output=$(ceph orch ps --service-name nvmeof.$POOL.$GROUP) orch_ps=$(echo $output | grep -o 'running' | wc -l) if [ "$orch_ps" -ne "$expected_count" ]; then + echo "[nvmeof.scale] Expected $expected_count running gateways in 'ceph orch ps', but found $orch_ps" return 1 fi @@ -69,7 +83,7 @@ total_gateways_count=$(( $(echo "$NVMEOF_GATEWAY_IP_ADDRESSES" | tr -cd ',' | wc scaled_down_gateways_count=$(( total_gateways_count - $(echo "$GATEWAYS" | tr -cd ',' | wc -c) - 1 )) -echo "[nvmeof.scale] Setting up config to remove gateways ${GATEWAYS}" +echo "[nvmeof.scale] SCALE DOWN: Setting up config to remove gateways ${GATEWAYS}" ceph orch ls --service-name nvmeof.$POOL.$GROUP --export > /tmp/nvmeof-gw.yaml ceph orch ls nvmeof --export > /tmp/nvmeof-gw.yaml cat /tmp/nvmeof-gw.yaml @@ -78,16 +92,17 @@ pattern=$(echo $GATEWAYS | sed 's/,/\\|/g') sed "/$pattern/d" /tmp/nvmeof-gw.yaml > /tmp/nvmeof-gw-new.yaml cat /tmp/nvmeof-gw-new.yaml -echo "[nvmeof.scale] Starting scale testing by removing ${GATEWAYS}" +echo "[nvmeof.scale] SCALE DOWN: Starting scale testing by removing ${GATEWAYS}" status_checks $total_gateways_count ceph orch apply -i /tmp/nvmeof-gw-new.yaml # downscale -ceph orch redeploy nvmeof.$POOL.$GROUP +# ceph orch redeploy nvmeof.$POOL.$GROUP sleep $DELAY status_checks $scaled_down_gateways_count -echo "[nvmeof.scale] Downscale complete - removed gateways (${GATEWAYS}); now scaling back up" +echo "[nvmeof.scale] SCALE DOWN successful! 
Removed gateways (${GATEWAYS}) and verified;" +echo "[nvmeof.scale] SCALE UP: scaling up to $total_gateways_count gateways (from $scaled_down_gateways_count gateways)" ceph orch apply -i /tmp/nvmeof-gw.yaml #upscale -ceph orch redeploy nvmeof.$POOL.$GROUP +# ceph orch redeploy nvmeof.$POOL.$GROUP sleep $DELAY status_checks $total_gateways_count - +echo "[nvmeof.scale] SCALE UP successful! All gateways running and verified." echo "[nvmeof.scale] Scale testing passed for ${GATEWAYS}" -- 2.39.5