From: Ilya Dryomov Date: Wed, 27 Sep 2023 18:17:11 +0000 (+0200) Subject: qa/suites/krbd: stress test for recovering from watch errors X-Git-Tag: v18.2.1~197^2 X-Git-Url: http://git.apps.os.sepia.ceph.com/?a=commitdiff_plain;h=1015937b553f5f4c466629ee1c3b6648b1093ca7;p=ceph.git qa/suites/krbd: stress test for recovering from watch errors Fixes: https://tracker.ceph.com/issues/63010 Signed-off-by: Ilya Dryomov (cherry picked from commit 237aa221ebad429457a621d2e38cfdf0025e38f9) --- diff --git a/qa/suites/krbd/singleton/% b/qa/suites/krbd/singleton/% new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/qa/suites/krbd/singleton/.qa b/qa/suites/krbd/singleton/.qa new file mode 120000 index 0000000000000..a602a0353e751 --- /dev/null +++ b/qa/suites/krbd/singleton/.qa @@ -0,0 +1 @@ +../.qa/ \ No newline at end of file diff --git a/qa/suites/krbd/singleton/bluestore-bitmap.yaml b/qa/suites/krbd/singleton/bluestore-bitmap.yaml new file mode 120000 index 0000000000000..a59cf5175069a --- /dev/null +++ b/qa/suites/krbd/singleton/bluestore-bitmap.yaml @@ -0,0 +1 @@ +.qa/objectstore/bluestore-bitmap.yaml \ No newline at end of file diff --git a/qa/suites/krbd/singleton/conf.yaml b/qa/suites/krbd/singleton/conf.yaml new file mode 100644 index 0000000000000..41292fa813980 --- /dev/null +++ b/qa/suites/krbd/singleton/conf.yaml @@ -0,0 +1,8 @@ +overrides: + ceph: + conf: + global: + mon warn on pool no app: false + ms die on skipped message: false + client: + rbd default features: 37 diff --git a/qa/suites/krbd/singleton/ms_mode$/.qa b/qa/suites/krbd/singleton/ms_mode$/.qa new file mode 120000 index 0000000000000..a602a0353e751 --- /dev/null +++ b/qa/suites/krbd/singleton/ms_mode$/.qa @@ -0,0 +1 @@ +../.qa/ \ No newline at end of file diff --git a/qa/suites/krbd/singleton/ms_mode$/crc-rxbounce.yaml b/qa/suites/krbd/singleton/ms_mode$/crc-rxbounce.yaml new file mode 100644 index 0000000000000..4d27d01133cdc --- /dev/null +++ 
b/qa/suites/krbd/singleton/ms_mode$/crc-rxbounce.yaml @@ -0,0 +1,5 @@ +overrides: + ceph: + conf: + client: + rbd default map options: ms_mode=crc,rxbounce diff --git a/qa/suites/krbd/singleton/ms_mode$/crc.yaml b/qa/suites/krbd/singleton/ms_mode$/crc.yaml new file mode 100644 index 0000000000000..3b072578f1fde --- /dev/null +++ b/qa/suites/krbd/singleton/ms_mode$/crc.yaml @@ -0,0 +1,5 @@ +overrides: + ceph: + conf: + client: + rbd default map options: ms_mode=crc diff --git a/qa/suites/krbd/singleton/ms_mode$/legacy-rxbounce.yaml b/qa/suites/krbd/singleton/ms_mode$/legacy-rxbounce.yaml new file mode 100644 index 0000000000000..244e45cbc7641 --- /dev/null +++ b/qa/suites/krbd/singleton/ms_mode$/legacy-rxbounce.yaml @@ -0,0 +1,5 @@ +overrides: + ceph: + conf: + client: + rbd default map options: ms_mode=legacy,rxbounce diff --git a/qa/suites/krbd/singleton/ms_mode$/legacy.yaml b/qa/suites/krbd/singleton/ms_mode$/legacy.yaml new file mode 100644 index 0000000000000..0048dcb0cec2a --- /dev/null +++ b/qa/suites/krbd/singleton/ms_mode$/legacy.yaml @@ -0,0 +1,5 @@ +overrides: + ceph: + conf: + client: + rbd default map options: ms_mode=legacy diff --git a/qa/suites/krbd/singleton/ms_mode$/secure.yaml b/qa/suites/krbd/singleton/ms_mode$/secure.yaml new file mode 100644 index 0000000000000..a735db18d2c30 --- /dev/null +++ b/qa/suites/krbd/singleton/ms_mode$/secure.yaml @@ -0,0 +1,5 @@ +overrides: + ceph: + conf: + client: + rbd default map options: ms_mode=secure diff --git a/qa/suites/krbd/singleton/tasks/.qa b/qa/suites/krbd/singleton/tasks/.qa new file mode 120000 index 0000000000000..a602a0353e751 --- /dev/null +++ b/qa/suites/krbd/singleton/tasks/.qa @@ -0,0 +1 @@ +../.qa/ \ No newline at end of file diff --git a/qa/suites/krbd/singleton/tasks/krbd_watch_errors.yaml b/qa/suites/krbd/singleton/tasks/krbd_watch_errors.yaml new file mode 100644 index 0000000000000..5e30ef2ba419e --- /dev/null +++ b/qa/suites/krbd/singleton/tasks/krbd_watch_errors.yaml @@ -0,0 +1,19 @@ 
+overrides:
+  ceph:
+    conf:
+      global:
+        osd pool default size: 1
+      osd:
+        osd shutdown pgref assert: true
+roles:
+- [mon.a, mgr.x, osd.0, client.0]
+
+tasks:
+- install:
+    extra_system_packages:
+    - fio
+- ceph:
+- workunit:
+    clients:
+      all:
+      - rbd/krbd_watch_errors.sh
diff --git a/qa/workunits/rbd/krbd_watch_errors.sh b/qa/workunits/rbd/krbd_watch_errors.sh
new file mode 100755
index 0000000000000..f650d2a748944
--- /dev/null
+++ b/qa/workunits/rbd/krbd_watch_errors.sh
@@ -0,0 +1,71 @@
+#!/usr/bin/env bash
+#
+# Stress test for recovering from watch errors in krbd: map an image with
+# osdkeepalive=60 to induce periodic watch errors, refresh the device in a
+# tight loop while fio writes to it for an hour, then verify that the
+# kernel logged at least 60 watch errors for the device.
+
+set -ex
+set -o pipefail
+
+# Write "1" to the sysfs refresh attribute of an rbd device in an endless
+# loop, printing a progress message every 100 writes.  The loop has no exit
+# condition of its own; the caller runs it in the background and kills it.
+#   $1 - rbd device id (the part of the device path after /dev/rbd)
+function refresh_loop() {
+    local dev_id="$1"
+
+    # stop tracing commands (set -x above) for the rest of the loop
+    set +x
+
+    local i
+    for ((i = 1; ; i++)); do
+        echo 1 | sudo tee "${SYSFS_DIR}/${dev_id}/refresh" > /dev/null
+        if ((i % 100 == 0)); then
+            echo "Refreshed ${i} times"
+        fi
+    done
+}
+
+readonly SYSFS_DIR="/sys/bus/rbd/devices"
+readonly IMAGE_NAME="watch-errors-test"
+
+rbd create -s 1G --image-feature exclusive-lock "${IMAGE_NAME}"
+
+# induce a watch error every 30 seconds
+dev="$(sudo rbd device map -o osdkeepalive=60 "${IMAGE_NAME}")"
+dev_id="${dev#/dev/rbd}"
+
+# constantly refresh, not just on watch errors
+refresh_loop "${dev_id}" &
+refresh_pid=$!
+
+# clear the kernel log so that only messages from this run are counted
+sudo dmesg -C
+
+# test that none of the above triggers a deadlock with a workload
+fio --name test --filename="${dev}" --ioengine=libaio --direct=1 \
+    --rw=randwrite --norandommap --randrepeat=0 --bs=512 --iodepth=128 \
+    --time_based --runtime=1h --eta=never
+
+# grep -c exits with status 1 when it counts zero matches; under "set -e"
+# with pipefail that would abort the script right here, skipping the
+# cleanup and the "Too few watch errors" diagnostic below.  Accept status 1
+# (no match) and let any other failure of dmesg or grep abort as before.
+# The log is read with sudo, like it is cleared above, because reading it
+# may be restricted to privileged users (kernel.dmesg_restrict).
+num_errors="$(sudo dmesg |
+    { grep -c "rbd${dev_id}: encountered watch error" || (($? == 1)); })"
+echo "Recorded ${num_errors} watch errors"
+
+kill "${refresh_pid}"
+wait
+
+sudo rbd device unmap "${dev}"
+
+if ((num_errors < 60)); then
+    echo "Too few watch errors"
+    exit 1
+fi
+
+echo OK