From: Kamoltat (Junior) Sirivadhna Date: Tue, 10 Feb 2026 20:10:47 +0000 (+0000) Subject: qa/standalone: add polling for PG log trimming in osd-backfill-recovery-log X-Git-Tag: testing/wip-vshankar-testing-20260224.100235~11^2~2 X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=6a02f6b4612eabc23c9bbc9d4eefee906aba2458;p=ceph-ci.git qa/standalone: add polling for PG log trimming in osd-backfill-recovery-log Add a 30-second polling loop after flush_pg_stats to wait for PG log and duplicate entries to be trimmed to their expected sizes before validation. This addresses timing issues where the test was inspecting the objectstore before log trimming operations completed. The loop polls 'ceph pg query' to check both log and dups lengths, breaking when both reach or fall below their expected thresholds. This prevents spurious test failures on varied teuthology machines where log trimming happens at different speeds after recovery completes. Solves intermittent failures where logs showed 50 entries instead of the expected 2, and dups showed 7 instead of the expected 8. 
Fixes: https://tracker.ceph.com/issues/74524 Signed-off-by: Kamoltat (Junior) Sirivadhna --- diff --git a/qa/standalone/osd-backfill/osd-backfill-recovery-log.sh b/qa/standalone/osd-backfill/osd-backfill-recovery-log.sh index f9a14493215..4712c3a5527 100755 --- a/qa/standalone/osd-backfill/osd-backfill-recovery-log.sh +++ b/qa/standalone/osd-backfill/osd-backfill-recovery-log.sh @@ -77,6 +77,26 @@ function _common_test() { flush_pg_stats + # Wait for log and dups to reach expected sizes (trimming may take time) + TIMEOUT=30 + count=0 + while true; do + current_log_len=$(ceph pg 1.0 query 2>/dev/null | jq '.info.stats.log_size' 2>/dev/null || echo "999") + current_dups_len=$(ceph pg 1.0 query 2>/dev/null | jq '.info.stats.log_dups_size' 2>/dev/null || echo "999") + + if [ "$current_log_len" -le "$loglen" ] && [ "$current_dups_len" -le "$dupslen" ]; then + echo "Log trimming complete: log=$current_log_len (expected <=$loglen), dups=$current_dups_len (expected <=$dupslen)" + break + fi + + sleep 1 + count=$((count + 1)) + if [ $count -gt $TIMEOUT ]; then + echo "WARNING: Log trimming timeout after ${TIMEOUT}s - log=$current_log_len (expected <=$loglen), dups=$current_dups_len (expected <=$dupslen)" + break + fi + done + newprimary=$(ceph pg dump pgs --format=json | jq '.pg_stats[0].up_primary') kill_daemons