git.apps.os.sepia.ceph.com Git - ceph-ci.git/commitdiff
qa: Add Teuthology test for BlueStore ESB assertion failure
author Jaya Prakash <jayaprakash@ibm.com>
Wed, 16 Apr 2025 10:50:30 +0000 (16:20 +0530)
committer Jaya Prakash <jayaprakash@ibm.com>
Mon, 2 Jun 2025 18:21:24 +0000 (23:51 +0530)
Adds a test to reproduce the !ito->is_valid() assertion in BlueStore
with bluestore_elastic_shared_blobs=true on a 2+1 EC pool, using a FIO
randwrite workload through the rados ioengine (4 jobs at iodepth 32,
15G and 12,500 objects per job, mixed 4k-24k block sizes). The test
deploys a 6-OSD cluster, runs FIO for one hour via a workunit, and
fails the run if any OSD goes down.
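
The suite can be scheduled in the usual way; a rough sketch (suite path
and filter taken from the new yaml, branch and machine type below are
placeholders, not part of this change):

    teuthology-suite --suite rados:singleton --filter ec-esb-fio \
        --ceph <branch> --machine-type smithi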

Signed-off-by: Jaya Prakash <jayaprakash@ibm.com>
qa/suites/rados/singleton/all/ec-esb-fio.yaml [new file with mode: 0644]
qa/workunits/rados/ec-esb-fio.sh [new file with mode: 0755]

diff --git a/qa/suites/rados/singleton/all/ec-esb-fio.yaml b/qa/suites/rados/singleton/all/ec-esb-fio.yaml
new file mode 100644 (file)
index 0000000..a2818d0
--- /dev/null
@@ -0,0 +1,46 @@
+meta:
+  - desc:
+      FIO randwrite against a 2+1 EC pool with bluestore_elastic_shared_blobs enabled, aiming to reproduce the BlueStore !ito->is_valid() assertion on a 6-OSD cluster
+
+roles:
+- - mon.a
+  - mgr.x
+  - client.0
+- - osd.0
+  - osd.1
+- - osd.2
+  - osd.3
+- - osd.4
+  - osd.5
+openstack:
+  - volumes: # attached to each instance
+      count: 6
+      size: 20 # GB
+
+overrides:
+  ceph:
+    conf:
+      osd:
+        bluestore write v2: false
+        debug osd: 5
+        debug bluestore: 5
+        bluestore_elastic_shared_blobs: true
+        osd memory target: 939524096
+        bluestore onode segment size: 0
+
+tasks:
+- install:
+- ceph:
+    log-ignorelist:
+      - \(POOL_APP_NOT_ENABLED\)
+      - \(OSDMAP_FLAGS\)
+      - \(OSD_
+      - \(OBJECT_
+      - \(PG_
+      - \(SLOW_OPS\)
+      - overall HEALTH
+      - slow request
+- workunit:
+    clients:
+      client.0:
+        - rados/ec-esb-fio.sh
diff --git a/qa/workunits/rados/ec-esb-fio.sh b/qa/workunits/rados/ec-esb-fio.sh
new file mode 100755 (executable)
index 0000000..32866f7
--- /dev/null
@@ -0,0 +1,102 @@
+#!/bin/bash
+# vim: ts=8 sw=2 smarttab
+set -ex
+
+# Install FIO
+if [[ -f /etc/debian_version ]]; then
+    sudo apt-get update
+    sudo apt-get install -y git gcc make librados-dev librbd-dev zlib1g-dev libaio-dev
+    git clone -b master https://github.com/axboe/fio.git /home/ubuntu/cephtest/fio
+    cd /home/ubuntu/cephtest/fio
+    ./configure
+    make
+    sudo make install
+    cd -
+elif [[ -f /etc/redhat-release ]]; then
+    sudo yum install -y fio
+else
+    echo "Unsupported OS"
+    exit 1
+fi
+
+sleep 10
+
+ceph config set osd osd_memory_target 939524096
+ceph config set osd bluestore_onode_segment_size 0
+ceph osd erasure-code-profile set myecprofile k=2 m=1
+ceph osd pool create ecpool 16 16 erasure myecprofile
+ceph osd pool set ecpool allow_ec_overwrites true
+
+status_log() {
+    echo "Cluster status at cleanup:"
+    ceph -s
+    ceph health detail
+}
+
+cleanup() {
+    status_log
+    ceph osd pool rm ecpool ecpool --yes-i-really-really-mean-it || true
+    ceph osd erasure-code-profile rm myecprofile || true
+    rm -rf /home/ubuntu/cephtest/fio || true
+}
+
+trap cleanup EXIT INT TERM
+
+echo "[ec-esb-fio] Starting FIO test..."
+
+
+fio --name=test-ec-esb \
+    --ioengine=rados \
+    --pool=ecpool \
+    --clientname=admin \
+    --conf=/etc/ceph/ceph.conf \
+    --time_based=1 \
+    --runtime=1h \
+    --invalidate=0 \
+    --direct=1 \
+    --touch_objects=0 \
+    --iodepth=32 \
+    --numjobs=4 \
+    --rw=randwrite \
+    --file_service_type=pareto:0.20:0 \
+    --bssplit=4k/16:8k/10:12k/9:16k/8:20k/7:24k/7 \
+    --size=15G \
+    --nrfiles=12500 \
+    --filename_format=stress_obj.\$jobnum.\$filenum \
+    &
+
+FIO_PID=$!
+
+ceph config dump | grep bluestore_elastic_shared_blobs || true
+ceph config dump | grep bluestore_onode_segment_size || true
+ceph osd dump | grep -A 10 ecpool || true
+
+
+TIMEOUT=3600
+START_TIME=$(date +%s)
+while true; do
+    CURRENT_TIME=$(date +%s)
+    ELAPSED=$((CURRENT_TIME - START_TIME))
+    if [ $ELAPSED -ge $TIMEOUT ]; then
+        echo "Reached 1-hour timeout, waiting for FIO to finish"
+        break
+    fi
+    if ceph health detail | grep -i "osd.*down"; then
+        echo "Detected OSD down state:"
+        ceph health detail | grep -i "osd.*down"
+        echo "Cleaning up..."
+        if [[ -n "$FIO_PID" ]]; then
+            kill -9 "$FIO_PID" || true
+        fi
+        exit 1
+    fi
+    ceph -s
+    ceph tell osd.0 perf dump bluestore | grep -A 2 onode || true
+    sleep 60
+done
+
+# Let the backgrounded FIO job finish before the EXIT trap removes the pool
+wait "$FIO_PID" || true
+
+echo "[ec-esb-fio] FIO test completed, log checks to follow"
+exit 0
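
Outside teuthology the workunit can be run by hand against a disposable
test cluster, and the OSD logs checked for the assert afterwards; a rough
sketch (default log locations assumed, not part of this change):

    bash qa/workunits/rados/ec-esb-fio.sh
    grep -n 'is_valid' /var/log/ceph/ceph-osd.*.log || echo 'assertion not hit'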