qa: Add test for subvolume_ls on osd full
author     Kotresh HR <khiremat@redhat.com>
           Thu, 24 Jul 2025 17:31:12 +0000 (17:31 +0000)
committer  Jos Collin <jcollin@redhat.com>
           Mon, 8 Sep 2025 05:03:41 +0000 (10:33 +0530)
Fixes: https://tracker.ceph.com/issues/72260
Signed-off-by: Kotresh HR <khiremat@redhat.com>
(cherry picked from commit 8547e57ebc4022ca6750149f49b68599a8af712e)

qa/cephfs/clusters/1-node-1-mds-1-osd.yaml [deleted file]
qa/cephfs/clusters/1-node-4-mds-1-osd.yaml [new file with mode: 0644]
qa/suites/fs/full/clusters/1-node-1-mds-1-osd.yaml [deleted symlink]
qa/suites/fs/full/clusters/1-node-4-mds-1-osd.yaml [new symlink]
qa/suites/fs/full/overrides.yaml
qa/suites/fs/full/tasks/mgr-osd-full.yaml
qa/workunits/fs/full/subvolume_ls.sh [new file with mode: 0755]

diff --git a/qa/cephfs/clusters/1-node-1-mds-1-osd.yaml b/qa/cephfs/clusters/1-node-1-mds-1-osd.yaml
deleted file mode 100644 (file)
index 865b976..0000000
+++ /dev/null
@@ -1,8 +0,0 @@
-roles:
-- [mon.a, mgr.x, mds.a, osd.0, client.0]
-openstack:
-- volumes: # attached to each instance
-    count: 1
-    size: 5 # GB
-- machine:
-    disk: 10 # GB
diff --git a/qa/cephfs/clusters/1-node-4-mds-1-osd.yaml b/qa/cephfs/clusters/1-node-4-mds-1-osd.yaml
new file mode 100644 (file)
index 0000000..dc3e106
--- /dev/null
@@ -0,0 +1,8 @@
+roles:
+- [mon.a, mgr.x, mds.a, mds.b, mds.c, mds.d, osd.0, client.0]
+openstack:
+- volumes: # attached to each instance
+    count: 1
+    size: 5 # GB
+- machine:
+    disk: 10 # GB
diff --git a/qa/suites/fs/full/clusters/1-node-1-mds-1-osd.yaml b/qa/suites/fs/full/clusters/1-node-1-mds-1-osd.yaml
deleted file mode 120000 (symlink)
index 517b765..0000000
+++ /dev/null
@@ -1 +0,0 @@
-.qa/cephfs/clusters/1-node-1-mds-1-osd.yaml
\ No newline at end of file
diff --git a/qa/suites/fs/full/clusters/1-node-4-mds-1-osd.yaml b/qa/suites/fs/full/clusters/1-node-4-mds-1-osd.yaml
new file mode 120000 (symlink)
index 0000000..95633a0
--- /dev/null
@@ -0,0 +1 @@
+.qa/cephfs/clusters/1-node-4-mds-1-osd.yaml
\ No newline at end of file
diff --git a/qa/suites/fs/full/overrides.yaml b/qa/suites/fs/full/overrides.yaml
index 921528d66a554d4c49140fd98c7ae8ba5c724f92..afaab9b5a8d8727bfedfe9b29ac5f43b08aabbbf 100644 (file)
@@ -17,3 +17,4 @@ overrides:
       - OSD_OUT_OF_ORDER_FULL
       - OSD_NEARFULL
       - OSD_FULL
+      - MGR_DOWN
diff --git a/qa/suites/fs/full/tasks/mgr-osd-full.yaml b/qa/suites/fs/full/tasks/mgr-osd-full.yaml
index df566545d1b4a95c5058a527b26c472695e90923..fffd3f1d7c3e7c8d9e553c544fce29a52372a40b 100644 (file)
@@ -29,3 +29,8 @@ tasks:
     clients:
       client.0:
         - fs/full/subvolume_snapshot_rm.sh
+- workunit:
+    cleanup: true
+    clients:
+      client.0:
+        - fs/full/subvolume_ls.sh
diff --git a/qa/workunits/fs/full/subvolume_ls.sh b/qa/workunits/fs/full/subvolume_ls.sh
new file mode 100755 (executable)
index 0000000..7975c40
--- /dev/null
@@ -0,0 +1,119 @@
+#!/usr/bin/env bash
+set -ex
+
+# This testcase exercises the 'ceph fs subvolume ls' mgr command when the osd
+# is full. The command used to miss a few subvolumes in the listing. The issue
+# occurs in a multi-active MDS setup. Please see the tracker
+# https://tracker.ceph.com/issues/72260
+
+# The suite sets the 'bluestore block size' to 2GiB, so the osd is 2GiB in
+# size. 25 subvolumes are created and a 1GB file is written at the root of
+# the mount. The full-ratios are set such that less than 500MB of data is
+# enough for the osd to be treated as full. The subvolumes are then listed
+# 20 times, failing over the mgr each time (to invalidate the readdir
+# cache), and the listing is validated on every iteration.
+
+SUBVOL_CNT=25
+
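+# expect_failure inverts the exit status of the given command: it succeeds only
+# when the command fails. It is not referenced below (presumably kept in line
+# with the other fs/full workunits).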
+expect_failure() {
+  if "$@"; then return 1; else return 0; fi
+}
+validate_subvol_cnt() {
+  if [ $1 -eq $SUBVOL_CNT ]; then return 0; else return 1; fi
+}
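+# restart_mgr fails over the active mgr ('x' is the only mgr in this cluster)
+# and waits up to 30s for an active mgr to reappear; the 'grep -v no' is meant
+# to skip the "no daemons active" status line while failover is in progress.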
+restart_mgr() {
+  ceph mgr fail x
+  timeout=30
+  while [ $timeout -gt 0 ]
+  do
+    active_mgr_cnt=$(ceph status | grep mgr | grep active | grep -v no | wc -l)
+    if [ $active_mgr_cnt -eq 1 ]; then break; fi
+    echo "Waiting for mgr to be active after failover: $timeout"
+    sleep 1
+    let "timeout-=1"
+  done
+}
+
+#Set client_use_random_mds
+ceph config set client client_use_random_mds true
+
+#Set max_mds to 3
+ceph fs set cephfs max_mds 3
+timeout=30
+while [ $timeout -gt 0 ]
+do
+  active_cnt=$(ceph fs status | grep active | wc -l)
+  if [ $active_cnt -eq 3 ]; then break; fi
+  echo "Waiting for 3 MDS to be active: $timeout"
+  sleep 1
+  let "timeout-=1"
+done
+
+#Create subvolumes
+for i in $(seq 1 $SUBVOL_CNT); do ceph fs subvolume create cephfs sub_$i; done
+
+#For debugging
+echo "Before write"
+df -h
+ceph osd df
+
+sudo dd if=/dev/urandom of=$CEPH_MNT/1GB_file-1 status=progress bs=1M count=1000
+
+# The suite (qa/suites/fs/full/tasks/mgr-osd-full.yaml) sets the 'bluestore block size'
+# to 2GiB, so the osd is 2GiB in size. The full-ratios set below make sure that less
+# than 500MB of data is enough for the osd to be treated as full.
+ceph osd set-full-ratio 0.2
+ceph osd set-nearfull-ratio 0.16
+ceph osd set-backfillfull-ratio 0.18
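+# Rough numbers behind these ratios (illustrative only): with a 2GiB osd, a
+# full-ratio of 0.2 corresponds to ~410MiB, so the 1GB file written above is
+# already well past the full threshold once the ratio is lowered.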
+
+timeout=30
+while [ $timeout -gt 0 ]
+do
+  health=$(ceph health detail)
+  [[ $health = *"OSD_FULL"* ]] && echo "OSD is full" && break
+  echo "Waiting for osd to be full: $timeout"
+  sleep 1
+  let "timeout-=1"
+done
+
+#For debugging
+echo "After ratio set"
+df -h
+ceph osd df
+
+#Clear the readdir cache by failing over the mgr, which forces a new libcephfs connection
+#Validate the subvolume ls output 20 times
+for i in {1..20};
+do
+  restart_mgr
+  #List and validate subvolumes count
+  subvol_cnt=$(ceph fs subvolume ls cephfs --format=json-pretty | grep sub_ | wc -l)
+  validate_subvol_cnt $subvol_cnt
+done
+
+#Delete all subvolumes
+for i in $(seq 1 $SUBVOL_CNT); do ceph fs subvolume rm cephfs sub_$i; done
+
+#Wait for the deleted subvolumes' data to be purged (trash dir becomes empty)
+trashdir=$CEPH_MNT/volumes/_deleting
+timeout=30
+while [ $timeout -gt 0 ]
+do
+  [ -z "$(sudo ls -A $trashdir)" ] && echo "Trash directory $trashdir is empty" &&  break
+  echo "Waiting for trash dir to be empty: $timeout"
+  sleep 1
+  let "timeout-=1"
+done
+
+sudo rm -f $CEPH_MNT/1GB_file-1
+
+#Set the ratios back for other full tests to run
+ceph osd set-full-ratio 0.95
+ceph osd set-nearfull-ratio 0.95
+ceph osd set-backfillfull-ratio 0.95
+
+#After test
+echo "After test"
+df -h
+ceph osd df
+
+echo OK