git-server-git.apps.pok.os.sepia.ceph.com Git - ceph-ci.git/commitdiff
qa/standalone: add polling for PG log trimming in osd-backfill-recovery-log
author Kamoltat (Junior) Sirivadhna <ksirivad@redhat.com>
Tue, 10 Feb 2026 20:10:47 +0000 (20:10 +0000)
committer Kamoltat (Junior) Sirivadhna <ksirivad@redhat.com>
Tue, 10 Feb 2026 21:09:34 +0000 (21:09 +0000)
Add a 30-second polling loop after flush_pg_stats to wait for PG log
and duplicate entries to be trimmed to their expected sizes before
validation. This addresses timing issues where the test was inspecting
the objectstore before log trimming operations completed.

The loop polls 'ceph pg query' to check both log and dups lengths,
breaking when both reach or fall below expected thresholds. This
prevents spurious test failures on varied teuthology machines where log
trimming happens at different speeds after recovery completes.

Solves intermittent failures where the log showed 50 entries instead of
the expected 2, and dups showed 7 instead of the expected 8.

Fixes: https://tracker.ceph.com/issues/74524
Signed-off-by: Kamoltat (Junior) Sirivadhna <ksirivad@redhat.com>
qa/standalone/osd-backfill/osd-backfill-recovery-log.sh

index f9a14493215d2dec256bae71738c1db2cedd841e..4712c3a5527bb39eb4d27e75fb8baf00b407ad00 100755 (executable)
@@ -77,6 +77,26 @@ function _common_test() {
 
     flush_pg_stats
 
+    # Wait for log and dups to reach expected sizes (trimming may take time)
+    TIMEOUT=30
+    count=0
+    while true; do
+      current_log_len=$(ceph pg 1.0 query 2>/dev/null | jq '.info.stats.log_size' 2>/dev/null || echo "999")
+      current_dups_len=$(ceph pg 1.0 query 2>/dev/null | jq '.info.stats.log_dups_size' 2>/dev/null || echo "999")
+      
+      if [ "$current_log_len" -le "$loglen" ] && [ "$current_dups_len" -le "$dupslen" ]; then
+        echo "Log trimming complete: log=$current_log_len (expected <=$loglen), dups=$current_dups_len (expected <=$dupslen)"
+        break
+      fi
+      
+      sleep 1
+      count=$((count + 1))
+      if [ $count -gt $TIMEOUT ]; then
+        echo "WARNING: Log trimming timeout after ${TIMEOUT}s - log=$current_log_len (expected <=$loglen), dups=$current_dups_len (expected <=$dupslen)"
+        break
+      fi
+    done
+
     newprimary=$(ceph pg dump pgs --format=json | jq '.pg_stats[0].up_primary')
     kill_daemons