git-server-git.apps.pok.os.sepia.ceph.com Git - ceph.git/commitdiff
qa/crimson: add coredump generation test using ASOK assert
author Ronen Friedman <rfriedma@redhat.com>
Sun, 10 May 2026 13:16:02 +0000 (13:16 +0000)
committer Ronen Friedman <rfriedma@redhat.com>
Fri, 15 May 2026 10:32:13 +0000 (10:32 +0000)
Trigger a crash on a Crimson OSD via the admin socket 'assert'
command and verify the OSD goes down and a coredump is produced.
Exercises the debug_asok_assert_abort path added in the companion
commit.

Signed-off-by: Ronen Friedman <rfriedma@redhat.com>
qa/suites/crimson-rados/singleton/all/test-coredump.yaml [new file with mode: 0644]
qa/workunits/crimson/test_coredump.sh [new file with mode: 0755]

diff --git a/qa/suites/crimson-rados/singleton/all/test-coredump.yaml b/qa/suites/crimson-rados/singleton/all/test-coredump.yaml
new file mode 100644 (file)
index 0000000..5df8e81
--- /dev/null
@@ -0,0 +1,37 @@
+roles:  # one role group: three mons, one mgr, three OSDs and client.0
+- - mon.a
+  - mon.b
+  - mon.c
+  - mgr.x
+  - osd.0
+  - osd.1
+  - osd.2
+  - client.0
+openstack:
+  - volumes:
+      count: 3
+      size: 10 # GB
+tasks:
+- install:
+- ceph:
+    pre-mgr-commands:
+      - sudo ceph config set mgr mgr_pool false --force
+    log-ignorelist:  # patterns for the crash / OSD-down / PG messages this test provokes on purpose
+      - but it is still running
+      - overall HEALTH_
+      - \(RECENT_CRASH\)
+      - \(OSD_DOWN\)
+      - \(OSD_.*DOWN\)
+      - \(PG_DEGRADED\)
+      - \(PG_
+      - Reduced data availability
+    conf:
+      osd:
+        osd min pg log entries: 5
+        crimson cpu num: 2
+        crimson memory: 16G
+      global:  # NOTE(review): no keys follow, so this parses as null; confirm it is intended
+- workunit:
+    clients:
+      client.0:
+        - crimson/test_coredump.sh  # the script added by this same commit under qa/workunits/
diff --git a/qa/workunits/crimson/test_coredump.sh b/qa/workunits/crimson/test_coredump.sh
new file mode 100755 (executable)
index 0000000..cd90123
--- /dev/null
@@ -0,0 +1,45 @@
+#!/bin/bash
+
+. $(dirname $0)/../../standalone/ceph-helpers.sh
+
+set -ex
+
+# Skip unless running on a Crimson OSD
+[ "$(ceph osd metadata 0 | jq -r '.osd_type')" == "crimson" ] || exit 0
+
+# Write some data so the OSD has real work behind it
+ceph osd pool create testpool 8 8
+rados -p testpool bench 10 write --no-cleanup
+
+sleep 5
+
+# Enable the admin socket assert command
+ceph tell osd.0 config set debug_asok_assert_abort true
+
+# Fire the assert — this crashes OSD 0. The command will fail because the
+# OSD dies mid-request, so we ignore the exit code.
+timeout 30 ceph tell osd.0 assert || true
+
+# Wait for the monitor to notice OSD 0 is down
+for i in $(seq 1 60); do
+    if ceph osd dump --format json | jq -e '.osds[] | select(.osd == 0) | .up == 0' >/dev/null 2>&1; then
+        break
+    fi
+    sleep 1
+done
+
+ceph osd dump --format json
+ceph osd dump --format json | jq -e '.osds[] | select(.osd == 0) | .up == 0' || \
+    { echo "OSD 0 did not go down after assert"; exit 1; }
+
+# Verify that a coredump was produced. Leave it in place for
+# downstream coredump/binary collection validation.
+find $TESTDIR/archive/coredump -type f -ls 2>/dev/null
+coredump_count=$(find $TESTDIR/archive/coredump -type f 2>/dev/null | wc -l)
+if [ "$coredump_count" -gt 0 ]; then
+    echo "Found $coredump_count coredump file(s) — OK"
+else
+    echo "WARNING: no coredump files found (may be a system coredump config issue)"
+fi
+
+echo "OK"