From: Ilya Dryomov <idryomov@gmail.com>
Date: Wed, 24 Jul 2024 06:44:46 +0000 (+0200)
Subject: qa/suites/krbd: stress test for recovering from watch errors for -o exclusive
X-Git-Tag: v19.1.1~52^2~1
X-Git-Url: http://git.apps.os.sepia.ceph.com/?a=commitdiff_plain;h=448e260dd5637512126a6b1a3234d7dc56eca6e9;p=ceph.git

qa/suites/krbd: stress test for recovering from watch errors for -o exclusive

This is based on a test added in commit 237aa221ebad ("qa/suites/krbd:
stress test for recovering from watch errors") for regular mappings.

Fixes: https://tracker.ceph.com/issues/67097
Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
(cherry picked from commit 8fee41da8b8cd250bbbd8490604193c0864c1295)
---

diff --git a/qa/suites/krbd/singleton/tasks/krbd_watch_errors_exclusive.yaml b/qa/suites/krbd/singleton/tasks/krbd_watch_errors_exclusive.yaml
new file mode 100644
index 0000000000000..aeab129ed7ecf
--- /dev/null
+++ b/qa/suites/krbd/singleton/tasks/krbd_watch_errors_exclusive.yaml
@@ -0,0 +1,19 @@
+overrides:
+  ceph:
+    conf:
+      global:
+        osd pool default size: 1  # the roles below define a single OSD (osd.0)
+      osd:
+        osd shutdown pgref assert: true
+roles:
+- [mon.a, mgr.x, osd.0, client.0]  # one role group: one mon, mgr, OSD and client
+
+tasks:
+- install:
+    extra_system_packages:
+      - fio  # the workunit below drives its I/O with fio
+- ceph:
+- workunit:
+    clients:
+      all:
+        - rbd/krbd_watch_errors_exclusive.sh
diff --git a/qa/workunits/rbd/krbd_watch_errors_exclusive.sh b/qa/workunits/rbd/krbd_watch_errors_exclusive.sh
new file mode 100755
index 0000000000000..e0b9586ec66f8
--- /dev/null
+++ b/qa/workunits/rbd/krbd_watch_errors_exclusive.sh
@@ -0,0 +1,31 @@
+#!/usr/bin/env bash
+
+set -ex
+set -o pipefail
+
+readonly IMAGE_NAME="watch-errors-exclusive-test"
+
+rbd create -s 1G --image-feature exclusive-lock,object-map "${IMAGE_NAME}"
+
+# induce a watch error every 30 seconds
+dev="$(sudo rbd device map -o exclusive,osdkeepalive=60 "${IMAGE_NAME}")"
+dev_id="${dev#/dev/rbd}"
+
+sudo dmesg -C
+
+# test that a workload doesn't encounter EIO errors
+fio --name test --filename="${dev}" --ioengine=libaio --direct=1 \
+    --rw=randwrite --norandommap --randrepeat=0 --bs=512 --iodepth=128 \
+    --time_based --runtime=1h --eta=never
+
+num_errors="$(dmesg | grep -c "rbd${dev_id}: encountered watch error")"
+echo "Recorded ${num_errors} watch errors"
+
+sudo rbd device unmap "${dev}"
+
+if ((num_errors < 60)); then
+    echo "Too few watch errors"
+    exit 1
+fi
+
+echo OK