From 1a3cbacdb242b37bd8b532b990408cfd3321d67f Mon Sep 17 00:00:00 2001
From: Kotresh HR
Date: Thu, 24 Jul 2025 17:31:12 +0000
Subject: [PATCH] qa: Add test for subvolume_ls on osd full

Fixes: https://tracker.ceph.com/issues/72260
Signed-off-by: Kotresh HR
(cherry picked from commit 8547e57ebc4022ca6750149f49b68599a8af712e)
---
 ...mds-1-osd.yaml => 1-node-4-mds-1-osd.yaml} |   2 +-
 .../fs/full/clusters/1-node-1-mds-1-osd.yaml  |   1 -
 .../fs/full/clusters/1-node-4-mds-1-osd.yaml  |   1 +
 qa/suites/fs/full/overrides.yaml              |   1 +
 qa/suites/fs/full/tasks/mgr-osd-full.yaml     |   5 +
 qa/workunits/fs/full/subvolume_ls.sh          | 119 ++++++++++++++++++
 6 files changed, 127 insertions(+), 2 deletions(-)
 rename qa/cephfs/clusters/{1-node-1-mds-1-osd.yaml => 1-node-4-mds-1-osd.yaml} (65%)
 delete mode 120000 qa/suites/fs/full/clusters/1-node-1-mds-1-osd.yaml
 create mode 120000 qa/suites/fs/full/clusters/1-node-4-mds-1-osd.yaml
 create mode 100755 qa/workunits/fs/full/subvolume_ls.sh

diff --git a/qa/cephfs/clusters/1-node-1-mds-1-osd.yaml b/qa/cephfs/clusters/1-node-4-mds-1-osd.yaml
similarity index 65%
rename from qa/cephfs/clusters/1-node-1-mds-1-osd.yaml
rename to qa/cephfs/clusters/1-node-4-mds-1-osd.yaml
index 865b976c699..dc3e10681e7 100644
--- a/qa/cephfs/clusters/1-node-1-mds-1-osd.yaml
+++ b/qa/cephfs/clusters/1-node-4-mds-1-osd.yaml
@@ -1,5 +1,5 @@
 roles:
-- [mon.a, mgr.x, mds.a, osd.0, client.0]
+- [mon.a, mgr.x, mds.a, mds.b, mds.c, mds.d, osd.0, client.0]
 openstack:
 - volumes: # attached to each instance
     count: 1
diff --git a/qa/suites/fs/full/clusters/1-node-1-mds-1-osd.yaml b/qa/suites/fs/full/clusters/1-node-1-mds-1-osd.yaml
deleted file mode 120000
index 517b76547e9..00000000000
--- a/qa/suites/fs/full/clusters/1-node-1-mds-1-osd.yaml
+++ /dev/null
@@ -1 +0,0 @@
-.qa/cephfs/clusters/1-node-1-mds-1-osd.yaml
\ No newline at end of file
diff --git a/qa/suites/fs/full/clusters/1-node-4-mds-1-osd.yaml b/qa/suites/fs/full/clusters/1-node-4-mds-1-osd.yaml
new file mode 120000
index 00000000000..95633a09f9d
--- /dev/null
+++ b/qa/suites/fs/full/clusters/1-node-4-mds-1-osd.yaml
@@ -0,0 +1 @@
+.qa/cephfs/clusters/1-node-4-mds-1-osd.yaml
\ No newline at end of file
diff --git a/qa/suites/fs/full/overrides.yaml b/qa/suites/fs/full/overrides.yaml
index 921528d66a5..afaab9b5a8d 100644
--- a/qa/suites/fs/full/overrides.yaml
+++ b/qa/suites/fs/full/overrides.yaml
@@ -17,3 +17,4 @@ overrides:
       - OSD_OUT_OF_ORDER_FULL
       - OSD_NEARFULL
       - OSD_FULL
+      - MGR_DOWN
diff --git a/qa/suites/fs/full/tasks/mgr-osd-full.yaml b/qa/suites/fs/full/tasks/mgr-osd-full.yaml
index a005f52037c..ff5e1adc06d 100644
--- a/qa/suites/fs/full/tasks/mgr-osd-full.yaml
+++ b/qa/suites/fs/full/tasks/mgr-osd-full.yaml
@@ -29,3 +29,8 @@ tasks:
     clients:
       client.0:
         - fs/full/subvolume_snapshot_rm.sh
+- workunit:
+    cleanup: true
+    clients:
+      client.0:
+        - fs/full/subvolume_ls.sh
diff --git a/qa/workunits/fs/full/subvolume_ls.sh b/qa/workunits/fs/full/subvolume_ls.sh
new file mode 100755
index 00000000000..7975c4061e2
--- /dev/null
+++ b/qa/workunits/fs/full/subvolume_ls.sh
@@ -0,0 +1,119 @@
+#!/usr/bin/env bash
+set -ex
+
+# This testcase exercises the 'ceph fs subvolume ls' mgr command when the osd
+# is full. The command used to miss out a few subvolumes in the listing; the
+# issue shows up in a multi-mds active setup. Please see the tracker:
+# https://tracker.ceph.com/issues/72260
+
+# The suite sets the 'bluestore block size' to 2GiB, so the osd is 2GiB in
+# size. 25 subvolumes are created and a 1GB file is written at the root.
+# The full ratios are then set so that less than 500MB of data is treated as
+# osd full. The subvolumes are listed 20 times, failing over the mgr each time
+# (to invalidate the readdir cache), and the count is validated on each run.
+
+SUBVOL_CNT=25
+
+expect_failure() {
+    if "$@"; then return 1; else return 0; fi
+}
+validate_subvol_cnt() {
+    if [ $1 -eq $SUBVOL_CNT ]; then return 0; else return 1; fi
+}
+restart_mgr() {
+    ceph mgr fail x
+    timeout=30
+    while [ $timeout -gt 0 ]
+    do
+        active_mgr_cnt=$(ceph status | grep mgr | grep active | grep -v no | wc -l)
+        if [ $active_mgr_cnt -eq 1 ]; then break; fi
+        echo "Waiting for mgr to be active after failover: $timeout"
+        sleep 1
+        let "timeout-=1"
+    done
+}
+
+#Set client_use_random_mds
+ceph config set client client_use_random_mds true
+
+#Set max_mds to 3 and wait for at least two active MDS
+ceph fs set cephfs max_mds 3
+timeout=30
+while [ $timeout -gt 0 ]
+do
+    active_cnt=$(ceph fs status | grep active | wc -l)
+    if [ $active_cnt -ge 2 ]; then break; fi
+    echo "Waiting for at least 2 active MDS: $timeout"
+    sleep 1
+    let "timeout-=1"
+done
+
+#Create subvolumes
+for i in $(seq 1 $SUBVOL_CNT); do ceph fs subvolume create cephfs sub_$i; done
+
+#For debugging
+echo "Before write"
+df -h
+ceph osd df
+
+sudo dd if=/dev/urandom of=$CEPH_MNT/1GB_file-1 status=progress bs=1M count=1000
+
+# The suite (qa/suites/fs/full/tasks/mgr-osd-full.yaml) sets the 'bluestore block size'
+# to 2GiB, so the osd is 2GiB in size. The full ratios set below ensure that
+# writing less than 500MB of data takes the osd to full.
+ceph osd set-full-ratio 0.2
+ceph osd set-nearfull-ratio 0.16
+ceph osd set-backfillfull-ratio 0.18
+
+timeout=30
+while [ $timeout -gt 0 ]
+do
+    health=$(ceph health detail)
+    [[ $health = *"OSD_FULL"* ]] && echo "OSD is full" && break
+    echo "Waiting for osd to be full: $timeout"
+    sleep 1
+    let "timeout-=1"
+done
+
+#For debugging
+echo "After ratio set"
+df -h
+ceph osd df
+
+#Clear the readdir cache by failing over the mgr, which forces a fresh libcephfs connection
+#List and validate the subvolumes 20 times
+for i in {1..20};
+do
+    restart_mgr
+    #List the subvolumes and validate the count
+    subvol_cnt=$(ceph fs subvolume ls cephfs --format=json-pretty | grep sub_ | wc -l)
+    validate_subvol_cnt $subvol_cnt
+done
+
+#Delete all subvolumes
+for i in $(seq 1 $SUBVOL_CNT); do ceph fs subvolume rm cephfs sub_$i; done
+
+#Wait for the deleted subvolume data to be purged from the trash dir
+trashdir=$CEPH_MNT/volumes/_deleting
+timeout=30
+while [ $timeout -gt 0 ]
+do
+    [ -z "$(sudo ls -A $trashdir)" ] && echo "Trash directory $trashdir is empty" && break
+    echo "Waiting for trash dir to be empty: $timeout"
+    sleep 1
+    let "timeout-=1"
+done
+
+sudo rm -f $CEPH_MNT/1GB_file-1
+
+#Set the ratios back so the other full-suite tests can run
+ceph osd set-full-ratio 0.95
+ceph osd set-nearfull-ratio 0.95
+ceph osd set-backfillfull-ratio 0.95
+
+#For debugging
+echo "After test"
+df -h
+ceph osd df
+
+echo OK
-- 
2.39.5
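
As a sanity check on the numbers used above: with 'bluestore block size' at 2GiB, a full-ratio of 0.2 puts the full threshold at roughly 0.2 * 2048MiB, about 410MiB, so the 1GB file written beforehand already exceeds it and OSD_FULL is reported shortly after the ratios are lowered. For anyone who wants to exercise the workunit by hand outside teuthology, a minimal sketch follows; the mount point is an assumption (teuthology's workunit task normally exports CEPH_MNT and mounts client.0 itself), and the filesystem name 'cephfs' is the one the script already hard-codes.

    # Hypothetical manual invocation on a throwaway dev cluster.
    # /mnt/cephfs is an assumed mount of the 'cephfs' filesystem; the script
    # needs sudo on the mount and admin caps for the 'ceph osd set-*-ratio'
    # commands, and it lowers/restores the full ratios itself.
    CEPH_MNT=/mnt/cephfs bash qa/workunits/fs/full/subvolume_ls.sh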