]> git.apps.os.sepia.ceph.com Git - ceph.git/commitdiff
qa/suites/rbd: add test to check rbd_support module recovery 54291/head
authorRamana Raja <rraja@redhat.com>
Mon, 18 Sep 2023 02:52:56 +0000 (22:52 -0400)
committerRamana Raja <rraja@redhat.com>
Wed, 1 Nov 2023 12:55:07 +0000 (08:55 -0400)
... on repeated blocklisting of its client.

There were issues with rbd_support module not being able to recover
from its RADOS client being repeatedly blocklisted. This occurred for
example in clusters with OSDs slow to process RBD requests while the
module's mirror_snapshot_scheduler was taking mirror snapshots by
requesting exclusive locks on the RBD images and workloads were running
on the snapshotted images via kernel clients.

Fixes: https://tracker.ceph.com/issues/62891
Signed-off-by: Ramana Raja <rraja@redhat.com>
(cherry picked from commit 2f2cd3bcff82afc3a4d251143eb462e700e7fc60)

qa/suites/rbd/cli/workloads/rbd_support_module_recovery.yaml [new file with mode: 0644]
qa/workunits/rbd/rbd_support_module_recovery.sh [new file with mode: 0755]

diff --git a/qa/suites/rbd/cli/workloads/rbd_support_module_recovery.yaml b/qa/suites/rbd/cli/workloads/rbd_support_module_recovery.yaml
new file mode 100644 (file)
index 0000000..aa4d000
--- /dev/null
@@ -0,0 +1,16 @@
+overrides:
+  ceph:
+    conf:
+      mgr:
+        # Verbose rbd logging in the mgr so that rbd_support module
+        # recovery from client blocklisting can be debugged from logs.
+        debug rbd: 20
+tasks:
+- install:
+    extra_system_packages:
+      # fio drives I/O on the krbd-mapped devices in the workunit
+      - fio
+- workunit:
+    clients:
+      client.0:
+        - rbd/rbd_support_module_recovery.sh
diff --git a/qa/workunits/rbd/rbd_support_module_recovery.sh b/qa/workunits/rbd/rbd_support_module_recovery.sh
new file mode 100755 (executable)
index 0000000..e9defce
--- /dev/null
@@ -0,0 +1,82 @@
+#!/bin/bash
+set -ex
+
+POOL=rbd
+IMAGE_PREFIX=image
+NUM_IMAGES=20
+RUN_TIME=3600
+
+# Enable snapshot-based mirroring on the pool and add a dummy peer so
+# that mirror snapshot schedules are accepted.
+rbd mirror pool enable ${POOL} image
+rbd mirror pool peer add ${POOL} dummy
+
+# Create images and schedule their mirror snapshots
+for ((i = 1; i <= ${NUM_IMAGES}; i++)); do
+    rbd create -s 1G --image-feature exclusive-lock ${POOL}/${IMAGE_PREFIX}$i
+    rbd mirror image enable ${POOL}/${IMAGE_PREFIX}$i snapshot
+    rbd mirror snapshot schedule add -p ${POOL} --image ${IMAGE_PREFIX}$i 1m
+done
+
+# Run fio workloads on images via kclient
+# Test the recovery of the rbd_support module and its scheduler from their
+# librbd client being blocklisted while an exclusive lock gets passed around
+# between their librbd client and a kclient trying to take mirror snapshots
+# and perform I/O on the same image.
+for ((i = 1; i <= ${NUM_IMAGES}; i++)); do
+    DEVS[$i]=$(sudo rbd device map ${POOL}/${IMAGE_PREFIX}$i)
+    fio --name=fiotest --filename="${DEVS[$i]}" --rw=randrw --bs=4K --direct=1 \
+        --ioengine=libaio --iodepth=2 --runtime=43200 --time_based \
+        &> /dev/null &
+done
+
+# Repeatedly blocklist rbd_support module's client ~10s after the module
+# recovers from previous blocklisting
+CURRENT_TIME=$(date +%s)
+END_TIME=$((CURRENT_TIME + RUN_TIME))
+PREV_CLIENT_ADDR=""
+CLIENT_ADDR=""
+while ((CURRENT_TIME <= END_TIME)); do
+    if [[ -n "${CLIENT_ADDR}" ]] &&
+       [[ "${CLIENT_ADDR}" != "${PREV_CLIENT_ADDR}" ]]; then
+            ceph osd blocklist add "${CLIENT_ADDR}"
+            # Confirm rbd_support module's client is blocklisted. Match
+            # the address as a fixed string (-F): it contains '.' and
+            # '/', which are regex metacharacters and could otherwise
+            # match an unrelated blocklist entry.
+            ceph osd blocklist ls | grep -qF -- "${CLIENT_ADDR}"
+            PREV_CLIENT_ADDR=${CLIENT_ADDR}
+    fi
+    sleep 10
+    CLIENT_ADDR=$(ceph mgr dump |
+        jq .active_clients[] |
+        jq 'select(.name == "rbd_support")' |
+        jq -r '[.addrvec[0].addr, "/", .addrvec[0].nonce|tostring] | add')
+    CURRENT_TIME=$(date +%s)
+done
+
+# Confirm that rbd_support module recovered from repeated blocklisting
+# Check that you can add a mirror snapshot schedule after a few retries
+for ((i = 1; i <= 24; i++)); do
+    rbd mirror snapshot schedule add -p ${POOL} \
+        --image ${IMAGE_PREFIX}1 2m && break
+    sleep 10
+done
+rbd mirror snapshot schedule ls -p ${POOL} --image ${IMAGE_PREFIX}1 |
+    grep 'every 2m'
+# Verify that the schedule present before client blocklisting is preserved
+rbd mirror snapshot schedule ls -p ${POOL} --image ${IMAGE_PREFIX}1 |
+    grep 'every 1m'
+rbd mirror snapshot schedule rm -p ${POOL} --image ${IMAGE_PREFIX}1 2m
+for ((i = 1; i <= ${NUM_IMAGES}; i++)); do
+    rbd mirror snapshot schedule rm -p ${POOL} --image ${IMAGE_PREFIX}$i 1m
+done
+
+# cleanup
+killall fio || true
+wait
+for ((i = 1; i <= ${NUM_IMAGES}; i++)); do
+    sudo rbd device unmap "${DEVS[$i]}"
+done
+
+echo OK