From: Sridhar Seshasayee <sridhar.seshasayee@ibm.com>
Date: Wed, 6 May 2026 15:11:33 +0000 (+0530)
Subject: osd: add last_degraded field to pg_stat_t
X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=252d14923858b6695dad4a7d70f70ed3881abd28;p=ceph.git

osd: add last_degraded field to pg_stat_t

Introduce a 'last_degraded' timestamp to the pg_stat_t structure to track
the initial point of redundancy loss. This field, used in conjunction
with 'last_clean', allows the manager to calculate a cluster-wide
durability score by measuring the duration of vulnerability windows.

Changes:
1) Add last_degraded (utime_t) to pg_stat_t in osd_types.h.
2) Increment pg_stat_t encoding version to 31. The decode logic
   defaults last_degraded to last_clean for backward compatibility
   during rolling upgrades.
3) Update operator==, dump(), and generate_test_instances() to
   support ceph-dencoder testing and JSON output.
4) Implement latching logic in PeeringState::prepare_stats_for_publish():
   - A PG is considered vulnerable if in DEGRADED or UNDERSIZED state.
   - last_degraded is set to 'now' only if it is <= last_clean,
     effectively latching the timestamp to the start of the failure
     event until the PG next becomes clean.
5) Add standalone tests to verify that:
   - The last_degraded timestamp latching logic works as described above.
   - The last_degraded timestamp is updated when OSDs are marked 'out' for
     draining purposes, in which case PGs are marked undersized.
6) Add a release note for the new 'last_degraded' field in PG stats.

Fixes: https://tracker.ceph.com/issues/76604
Signed-off-by: Sridhar Seshasayee <sridhar.seshasayee@ibm.com>
---

diff --git a/PendingReleaseNotes b/PendingReleaseNotes
index d94e35fa6b5..e330cc903b9 100644
--- a/PendingReleaseNotes
+++ b/PendingReleaseNotes
@@ -115,6 +115,12 @@
   invalidates entries when underlying cluster maps are updated. The cache is enabled
   by default. The previous `mgr_ttl_cache_expire_seconds` configuration option has
   been removed and replaced with `mgr_map_cache_enabled` (default: true).
+* The ``last_degraded`` timestamp is added to the ``pg_stat_t`` structure to
+  track the initial point of redundancy loss when a PG enters an undersized or
+  degraded state. This timestamp is latched until the PG returns to a clean
+  state and a subsequent redundancy loss occurs. Used in conjunction with
+  ``last_clean``, the ``last_degraded`` timestamp enables the calculation of
+  data vulnerability and durability scores.
 
 >=20.0.0
 
diff --git a/qa/standalone/osd/osd-recovery-stats.sh b/qa/standalone/osd/osd-recovery-stats.sh
index fd8087646dd..6a402d4914f 100755
--- a/qa/standalone/osd/osd-recovery-stats.sh
+++ b/qa/standalone/osd/osd-recovery-stats.sh
@@ -505,6 +505,217 @@ function TEST_recovery_multi() {
     kill_daemons $dir || return 1
 }
 
+# last_degraded must pass last_clean on the first replica loss, stay latched
+# on the second loss, and be overtaken by last_clean once the PG is clean.
+function TEST_recovery_last_degraded_latching() {
+    local dir=$1 osds=6
+    local osd_a osd_b last_degraded_final last_clean_final
+
+    # Setup Cluster
+    run_mon $dir a || return 1
+    run_mgr $dir x || return 1
+    for i in $(seq 0 $(expr $osds - 1)); do
+      run_osd $dir $i || return 1
+    done
+
+    # Create Pool with specific replica counts
+    create_pool $poolname 8 8 || return 1
+    ceph osd pool set $poolname size 3 || return 1
+    ceph osd pool set $poolname min_size 1 || return 1
+    wait_for_clean || return 1
+
+    # Inject data
+    for i in $(seq 1 100); do
+      rados -p $poolname put obj$i /dev/null || return 1
+    done
+
+    # Identify the PG holding obj1 and its two non-primary OSDs
+    local pgid=$(get_pg $poolname obj1)
+    local replicaosds=$(get_osds $poolname obj1 | awk '{print $2, $3}')
+    read -r osd_a osd_b <<< "$replicaosds"
+
+    # Capture baseline timestamp
+    local last_clean_start=$(ceph pg $pgid query | \
+      jq -r '.info.stats.last_clean')
+
+    # --- Step 1: Kill the first non-primary OSD (osd_a) ---
+    echo "Setting norecover to freeze PG state..."
+    ceph osd set norecover || return 1
+    echo "Stopping OSD.$osd_a..."
+    kill $(cat $dir/osd.${osd_a}.pid) || return 1
+    ceph osd down osd.${osd_a} || return 1
+    ceph osd out osd.${osd_a} || return 1
+
+    # 1.1 Wait and confirm state moves to degraded or undersized
+    local state=""
+    for i in $(seq 1 30); do
+      state=$(ceph pg $pgid query | jq -r '.info.stats.state')
+      echo "Current PG $pgid state: $state"
+      if [[ "$state" == *"degraded"* || "$state" == *"undersized"* ]]; then
+        break
+      fi
+      sleep 1
+    done
+
+    if [[ "$state" != *"degraded"* ]] && [[ "$state" != *"undersized"* ]]; then
+      echo "Error: PG $pgid state ($state) did not become " \
+           "degraded/undersized after killing osd.$osd_a."
+      return 1
+    fi
+
+    # 1.2 Confirm last_degraded updated; require digit-led values first, as
+    # an empty or "null" (missing field) string would skew the comparison
+    local last_degraded_t1=$(ceph pg $pgid query | \
+      jq -r '.info.stats.last_degraded')
+    echo "Queried last_degraded (T1): $last_degraded_t1"
+    if [[ "$last_clean_start" == [0-9]* && "$last_degraded_t1" == [0-9]* ]] && \
+       [[ "$last_degraded_t1" > "$last_clean_start" ]]; then
+      echo "Confirmed: last_degraded ($last_degraded_t1) updated on failure."
+    else
+      echo "Error: last_degraded ($last_degraded_t1) is not newer than " \
+           "initial last_clean ($last_clean_start)."
+      return 1
+    fi
+
+    # --- Step 2: Kill the second non-primary OSD (osd_b) ---
+    echo "Stopping OSD.$osd_b..."
+    kill $(cat $dir/osd.${osd_b}.pid) || return 1
+    ceph osd down osd.${osd_b} || return 1
+    ceph osd out osd.${osd_b} || return 1
+
+    # 2.1 Confirm last_degraded remains latched (the same)
+    local last_degraded_t2=$(ceph pg $pgid query | \
+      jq -r '.info.stats.last_degraded')
+    echo "Queried last_degraded (T2): $last_degraded_t2"
+    if [[ "$last_degraded_t2" == "$last_degraded_t1" ]]; then
+      echo "Test Passed: last_degraded timestamp remained " \
+           "stable at $last_degraded_t2."
+    else
+      echo "Test Failed: last_degraded updated to " \
+           "$last_degraded_t2 on second failure."
+      return 1
+    fi
+
+    # --- Step 3: Recovery ---
+    echo "Unsetting norecover and restarting OSDs..."
+    ceph osd unset norecover || return 1
+    echo "Restarting OSDs $osd_a and $osd_b..."
+    activate_osd $dir $osd_a || return 1
+    activate_osd $dir $osd_b || return 1
+    wait_for_clean || return 1
+
+    # --- Step 4: Final Verification ---
+    local final_stats=$(ceph pg $pgid query | \
+      jq -r '.info.stats | "\(.last_degraded) \(.last_clean)"')
+    read -r last_degraded_final last_clean_final <<< "$final_stats"
+
+    echo "Final Timestamps -> Last Degraded: $last_degraded_final, " \
+         "Last Clean: $last_clean_final"
+    if [[ "$last_clean_final" > "$last_degraded_final" ]]; then
+      echo "Test Passed: Recovery successful. last_clean ($last_clean_final) " \
+           "is newer than last_degraded ($last_degraded_final)."
+    else
+      echo "Test Failed: last_clean ($last_clean_final) was not updated " \
+           "correctly after recovery."
+      return 1
+    fi
+
+    # Cleanup
+    delete_pool $poolname
+    kill_daemons $dir || return 1
+}
+
+# last_degraded must pass last_clean once a PG turns undersized after an OSD
+# is marked out and the pool size is raised beyond the remaining OSDs.
+function TEST_recovery_last_degraded_undersized() {
+    local dir=$1 osds=3
+
+    # 1. Setup Cluster
+    run_mon $dir a || return 1
+    run_mgr $dir x || return 1
+    for i in $(seq 0 $(expr $osds - 1)); do
+      run_osd $dir $i || return 1
+    done
+
+    # 2. Create Pool and force size 1
+    create_pool $poolname 8 8 || return 1
+    ceph osd pool set $poolname size 1 --yes-i-really-mean-it || return 1
+    wait_for_clean || return 1
+
+    # Inject data
+    for i in $(seq 1 50); do
+      rados -p $poolname put obj$i /dev/null || return 1
+    done
+
+    local pgid=$(get_pg $poolname obj1)
+    local primary=$(get_primary $poolname obj1)
+
+    # 3. Select Non-Primary OSD
+    local replica_osd=""
+    for i in $(seq 0 $(expr $osds - 1)); do
+      if [[ "$i" != "$primary" ]]; then
+          replica_osd=$i
+          break
+      fi
+    done
+    echo "Primary is OSD.$primary, selected OSD.$replica_osd to mark OUT."
+
+    local last_clean_start=$(ceph pg $pgid query | \
+      jq -r '.info.stats.last_clean')
+
+    # 4. Mark non-primary OSD out and set norecover
+    ceph osd set norecover || return 1
+    ceph osd out $replica_osd || return 1
+
+    # 5. Increase pool size to 4
+    echo "Increasing pool size to 4..."
+    ceph osd pool set $poolname size 4 || return 1
+
+    # 6. Unset norecover and kick the recovery queue
+    echo "Starting recovery..."
+    ceph osd unset norecover || return 1
+    ceph tell osd.$primary debug kick_recovery_wq 0
+    sleep 10
+    flush_pg_stats || return 1
+
+    # 7. Custom recovery-wait logic
+    echo "Waiting for $pgid to be marked undersized..."
+    for i in $(seq 1 300); do
+      # Fetch only the stats for the specific PG in JSON format
+      local current_state=$(ceph pg $pgid query | jq -r '.info.stats.state')
+      echo "Iteration $i: PG $pgid state is [$current_state]"
+      # Done once the state has 'undersized' and no longer has 'recovering'
+      if [[ "$current_state" == *"undersized"* ]] && \
+         [[ "$current_state" != *"recovering"* ]]; then
+        echo "PG $pgid is marked undersized (current state: $current_state)."
+        break
+      fi
+      if [ "$i" = "300" ]; then
+        echo "Timeout waiting for $pgid to become undersized"
+        ceph pg $pgid query | jq .
+        return 1
+      fi
+      sleep 1
+    done
+
+    # 8. Verification; digit-led values only, so "" or "null" cannot pass
+    local last_degraded_final=$(ceph pg $pgid query | \
+      jq -r '.info.stats.last_degraded')
+    echo "Initial Clean:  $last_clean_start"
+    echo "Final Degraded: $last_degraded_final"
+    if [[ "$last_clean_start" == [0-9]* && "$last_degraded_final" == [0-9]* ]] && \
+       [[ "$last_degraded_final" > "$last_clean_start" ]]; then
+      echo "Test Passed: last_degraded updated correctly."
+    else
+      echo "Test Failed: last_degraded ($last_degraded_final) was not updated."
+      return 1
+    fi
+
+    # Cleanup
+    delete_pool $poolname
+    kill_daemons $dir || return 1
+}
+
 main osd-recovery-stats "$@"
 
 # Local Variables:
diff --git a/src/osd/PeeringState.cc b/src/osd/PeeringState.cc
index a6d12d1e01f..e0effaaa85d 100644
--- a/src/osd/PeeringState.cc
+++ b/src/osd/PeeringState.cc
@@ -4498,6 +4498,17 @@ std::optional<pg_stat_t> PeeringState::prepare_stats_for_publish(
     if ((info.stats.state & PG_STATE_UNDERSIZED) == 0)
       info.stats.last_fullsized = now;
 
+    // check if the PG is vulnerable
+    if (info.stats.state & (PG_STATE_DEGRADED|PG_STATE_UNDERSIZED)) {
+      // latch last_degraded: stamp it only when it is not newer than
+      // last_clean, i.e. on the first failure since the PG was last clean
+      if (info.stats.last_degraded <= info.stats.last_clean) {
+        info.stats.last_degraded = now;
+      }
+    }
+    // update pre_publish so the change is sent immediately
+    pre_publish.last_degraded = info.stats.last_degraded;
+
     psdout(15) << "publish_stats_to_osd " << pre_publish.reported_epoch
 	       << ":" << pre_publish.reported_seq << dendl;
     return std::make_optional(std::move(pre_publish));
diff --git a/src/osd/osd_types.cc b/src/osd/osd_types.cc
index 873163fd9a2..d9fe5491b8e 100644
--- a/src/osd/osd_types.cc
+++ b/src/osd/osd_types.cc
@@ -2947,6 +2947,7 @@ void pg_stat_t::dump(Formatter *f) const
   f->dump_stream("last_active") << last_active;
   f->dump_stream("last_peered") << last_peered;
   f->dump_stream("last_clean") << last_clean;
+  f->dump_stream("last_degraded") << last_degraded;
   f->dump_stream("last_became_active") << last_became_active;
   f->dump_stream("last_became_peered") << last_became_peered;
   f->dump_stream("last_unstale") << last_unstale;
@@ -3094,7 +3095,7 @@ bool operator==(const pg_scrubbing_status_t& l, const pg_scrubbing_status_t& r)
 
 void pg_stat_t::encode(ceph::buffer::list &bl) const
 {
-  ENCODE_START(30, 22, bl);
+  ENCODE_START(31, 22, bl);
   encode(version, bl);
   encode(reported_seq, bl);
   encode(reported_epoch, bl);
@@ -3157,6 +3158,7 @@ void pg_stat_t::encode(ceph::buffer::list &bl) const
   encode(scrub_sched_status.m_osd_to_respond, bl);
   encode(scrub_sched_status.m_ordinal_of_requested_replica, bl);
   encode(scrub_sched_status.m_num_to_reserve, bl);
+  encode(last_degraded, bl);
 
   ENCODE_FINISH(bl);
 }
@@ -3165,7 +3167,7 @@ void pg_stat_t::decode(ceph::buffer::list::const_iterator &bl)
 {
   bool tmp;
   uint32_t old_state;
-  DECODE_START(30, bl);
+  DECODE_START(31, bl);
   decode(version, bl);
   decode(reported_seq, bl);
   decode(reported_epoch, bl);
@@ -3267,6 +3269,11 @@ void pg_stat_t::decode(ceph::buffer::list::const_iterator &bl)
     } else {
       scrub_sched_status.m_num_to_reserve = 0;
     }
+    if (struct_v >= 31) {
+      decode(last_degraded, bl);
+    } else {
+      last_degraded = last_clean;
+    }
   }
   DECODE_FINISH(bl);
 }
@@ -3290,6 +3297,7 @@ list<pg_stat_t> pg_stat_t::generate_test_instances()
   a.last_unstale = utime_t(1002, 5);
   a.last_undegraded = utime_t(1002, 7);
   a.last_fullsized = utime_t(1002, 8);
+  a.last_degraded = utime_t(1002, 9);
   a.log_start = eversion_t(1, 4);
   a.ondisk_log_start = eversion_t(1, 5);
   a.created = 6;
@@ -3328,6 +3336,7 @@ list<pg_stat_t> pg_stat_t::generate_test_instances()
   a.acting_primary = 124;
   a.blocked_by.push_back(155);
   a.blocked_by.push_back(156);
+  a.last_degraded = utime_t(1005, 1);
   o.push_back(pg_stat_t(a));
 
   return o;
@@ -3387,7 +3396,8 @@ bool operator==(const pg_stat_t& l, const pg_stat_t& r)
     l.objects_scrubbed == r.objects_scrubbed &&
     l.scrub_duration == r.scrub_duration &&
     l.objects_trimmed == r.objects_trimmed &&
-    l.snaptrim_duration == r.snaptrim_duration;
+    l.snaptrim_duration == r.snaptrim_duration &&
+    l.last_degraded == r.last_degraded;
 }
 
 // -- store_statfs_t --
diff --git a/src/osd/osd_types.h b/src/osd/osd_types.h
index 846ffc6edf4..74226b60a2c 100644
--- a/src/osd/osd_types.h
+++ b/src/osd/osd_types.h
@@ -2333,6 +2333,7 @@ struct pg_stat_t {
   utime_t last_active;  // state & PG_STATE_ACTIVE
   utime_t last_peered;  // state & PG_STATE_ACTIVE || state & PG_STATE_PEERED
   utime_t last_clean;   // state & PG_STATE_CLEAN
+  utime_t last_degraded; // state & (PG_STATE_DEGRADED | PG_STATE_UNDERSIZED)
   utime_t last_unstale; // (state & PG_STATE_STALE) == 0
   utime_t last_undegraded; // (state & PG_STATE_DEGRADED) == 0
   utime_t last_fullsized; // (state & PG_STATE_UNDERSIZED) == 0