From 6a02f6b4612eabc23c9bbc9d4eefee906aba2458 Mon Sep 17 00:00:00 2001 From: "Kamoltat (Junior) Sirivadhna" Date: Tue, 10 Feb 2026 20:10:47 +0000 Subject: [PATCH] qa/standalone: add polling for PG log trimming in osd-backfill-recovery-log Add a 30-second polling loop after flush_pg_stats to wait for PG log and duplicate entries to be trimmed to their expected sizes before validation. This addresses timing issues where the test was inspecting the objectstore before log trimming operations completed. The loop polls 'ceph pg query' to check both log and dups lengths, breaking when both reach or fall below their expected thresholds. If the timeout expires first, the loop prints a warning and the test proceeds to validation. This prevents spurious test failures on varied teuthology machines where log trimming happens at different speeds after recovery completes. Solves intermittent failures where logs showed 50 entries instead of the expected 2, and dups showed 7 instead of the expected 8. Fixes: https://tracker.ceph.com/issues/74524 Signed-off-by: Kamoltat (Junior) Sirivadhna --- .../osd-backfill/osd-backfill-recovery-log.sh | 20 +++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/qa/standalone/osd-backfill/osd-backfill-recovery-log.sh b/qa/standalone/osd-backfill/osd-backfill-recovery-log.sh index f9a14493215..4712c3a5527 100755 --- a/qa/standalone/osd-backfill/osd-backfill-recovery-log.sh +++ b/qa/standalone/osd-backfill/osd-backfill-recovery-log.sh @@ -77,6 +77,26 @@ function _common_test() { flush_pg_stats + # Wait for log and dups to reach expected sizes (trimming may take time) + TIMEOUT=30 + count=0 + while true; do + current_log_len=$(ceph pg 1.0 query 2>/dev/null | jq '.info.stats.log_size' 2>/dev/null || echo "999") + current_dups_len=$(ceph pg 1.0 query 2>/dev/null | jq '.info.stats.log_dups_size' 2>/dev/null || echo "999") + + if [ "$current_log_len" -le "$loglen" ] && [ "$current_dups_len" -le "$dupslen" ]; then + echo "Log trimming complete: log=$current_log_len (expected <=$loglen), dups=$current_dups_len (expected <=$dupslen)" + break + fi + + 
sleep 1 + count=$((count + 1)) + if [ $count -gt $TIMEOUT ]; then + echo "WARNING: Log trimming timeout after ${TIMEOUT}s - log=$current_log_len (expected <=$loglen), dups=$current_dups_len (expected <=$dupslen)" + break + fi + done + newprimary=$(ceph pg dump pgs --format=json | jq '.pg_stats[0].up_primary') kill_daemons -- 2.47.3