osd: add last_degraded field to pg_stat_t

author Sridhar Seshasayee <sridhar.seshasayee@ibm.com>

Wed, 6 May 2026 15:11:33 +0000 (20:41 +0530)

committer Sridhar Seshasayee <sridhar.seshasayee@ibm.com>

Thu, 4 Jun 2026 14:14:56 +0000 (19:44 +0530)
author Sridhar Seshasayee <sridhar.seshasayee@ibm.com>
Wed, 6 May 2026 15:11:33 +0000 (20:41 +0530)
committer Sridhar Seshasayee <sridhar.seshasayee@ibm.com>
Thu, 4 Jun 2026 14:14:56 +0000 (19:44 +0530)
diff --git a/PendingReleaseNotes b/PendingReleaseNotes

index d94e35fa6b522730ce666366da9274e0215b6bb4..e330cc903b9a0a98be075869fb00791cb44a0604 100644 (file)
--- a/PendingReleaseNotes
+++ b/PendingReleaseNotes
@@ -115,6 +115,12 @@
    invalidates entries when underlying cluster maps are updated. The cache is enabled
    by default. The previous `mgr_ttl_cache_expire_seconds` configuration option has
    been removed and replaced with `mgr_map_cache_enabled` (default: true).
+* The ``last_degraded`` timestamp is added to the ``pg_stat_t`` structure to
+  track the initial point of redundancy loss when a PG enters an undersized or
+  degraded state. This timestamp is latched until the PG returns to a clean
+  state and a subsequent redundancy loss occurs. Used in conjunction with
+  ``last_clean``, the ``last_degraded`` timestamp enables the calculation of
+  data vulnerability and durability scores.
  
  >=20.0.0
  
diff --git a/qa/standalone/osd/osd-recovery-stats.sh b/qa/standalone/osd/osd-recovery-stats.sh

index fd8087646dda62bce703cf309ca7e38b62a09a7e..6a402d4914f61916d8c9e80a249d372c67166140 100755 (executable)
--- a/qa/standalone/osd/osd-recovery-stats.sh
+++ b/qa/standalone/osd/osd-recovery-stats.sh
@@ -505,6 +505,217 @@ function TEST_recovery_multi() {
      kill_daemons $dir || return 1
  }
  
+function TEST_recovery_last_degraded_latching() {
+    local dir=$1
+    local osds=6
+
+    # Setup Cluster
+    run_mon $dir a || return 1
+    run_mgr $dir x || return 1
+    for i in $(seq 0 $(expr $osds - 1)); do
+      run_osd $dir $i || return 1
+    done
+
+    # Create Pool with specific replica counts
+    create_pool $poolname 8 8
+    ceph osd pool set $poolname size 3
+    ceph osd pool set $poolname min_size 1
+    wait_for_clean || return 1
+
+    # Inject data
+    local numobjs=100
+    for i in $(seq 1 $numobjs); do
+      rados -p $poolname put obj$i /dev/null
+    done
+
+    # Identify PG and OSDs
+    local pgid=$(get_pg $poolname obj1)
+    local replicaosds=$(get_osds $poolname obj1 | awk '{print $2, $3}')
+    read -r osd_a osd_b <<< "$replicaosds"
+
+    # Capture baseline timestamp
+    local last_clean_start=$(ceph pg $pgid query | \
+      jq -r '.info.stats.last_clean')
+
+    # --- Step 1: Kill the first non-primary OSD (osd_a) ---
+    echo "Setting norecover to freeze PG state..."
+    ceph osd set norecover
+
+    echo "Stopping OSD.$osd_a..."
+    kill $(cat $dir/osd.${osd_a}.pid)
+    ceph osd down osd.${osd_a}
+    ceph osd out osd.${osd_a}
+
+    # 1.1 Wait and confirm state moves to degraded or undersized
+    local state=""
+    for i in $(seq 1 30); do
+      state=$(ceph pg $pgid query | jq -r '.info.stats.state')
+      echo "Current PG $pgid state: $state"
+      if [[ "$state" == *"degraded"* ]] || \
+         [[ "$state" == *"undersized"* ]]; then
+        break
+      fi
+      sleep 1
+    done
+
+    if [[ "$state" != *"degraded"* ]] && [[ "$state" != *"undersized"* ]]; then
+      echo "Error: PG $pgid state ($state) did not become " \
+           "degraded/undersized after killing osd.$osd_a."
+      return 1
+    fi
+
+    # 1.2 Confirm last_degraded updated
+    local last_degraded_t1=$(ceph pg $pgid query | \
+      jq -r '.info.stats.last_degraded')
+    echo "Queried last_degraded (T1): $last_degraded_t1"
+    if [[ "$last_degraded_t1" > "$last_clean_start" ]]; then
+      echo "Confirmed: last_degraded ($last_degraded_t1) updated on failure."
+    else
+      echo "Error: last_degraded ($last_degraded_t1) is not newer than " \
+           "initial last_clean ($last_clean_start)."
+      return 1
+    fi
+
+    # --- Step 2: Kill the second non-primary OSD (osd_b) ---
+    echo "Stopping OSD.$osd_b..."
+    kill $(cat $dir/osd.${osd_b}.pid)
+    ceph osd down osd.${osd_b}
+    ceph osd out osd.${osd_b}
+
+    # 2.1 Confirm last_degraded remains latched (the same)
+    local last_degraded_t2=$(ceph pg $pgid query | \
+      jq -r '.info.stats.last_degraded')
+    echo "Queried last_degraded (T2): $last_degraded_t2"
+    if [[ "$last_degraded_t2" == "$last_degraded_t1" ]]; then
+      echo "Test Passed: last_degraded timestamp remained " \
+           "stable at $last_degraded_t2."
+    else
+      echo "Test Failed: last_degraded updated to " \
+           "$last_degraded_t2 on second failure."
+      return 1
+    fi
+
+    # --- Step 3: Recovery ---
+    echo "Unsetting norecover and restarting OSDs..."
+    ceph osd unset norecover
+
+    echo "Restarting OSDs $osd_a and $osd_b..."
+    activate_osd $dir $osd_a
+    activate_osd $dir $osd_b
+    wait_for_clean || return 1
+
+    # --- Step 4: Final Verification ---
+    local final_stats=$(ceph pg $pgid query | \
+      jq -r '.info.stats | "\(.last_degraded) \(.last_clean)"')
+    read -r last_degraded_final last_clean_final <<< "$final_stats"
+
+    echo "Final Timestamps -> Last Degraded: $last_degraded_final, " \
+         "Last Clean: $last_clean_final"
+    if [[ "$last_clean_final" > "$last_degraded_final" ]]; then
+      echo "Test Passed: Recovery successful. last_clean ($last_clean_final) " \
+           "is newer than last_degraded ($last_degraded_final)."
+    else
+      echo "Test Failed: last_clean ($last_clean_final) was not updated " \
+           "correctly after recovery."
+      return 1
+    fi
+
+    # Cleanup
+    delete_pool $poolname
+    kill_daemons $dir || return 1
+}
+
+function TEST_recovery_last_degraded_undersized() {
+    local dir=$1
+    local osds=3
+
+    # 1. Setup Cluster
+    run_mon $dir a || return 1
+    run_mgr $dir x || return 1
+    for i in $(seq 0 $(expr $osds - 1)); do
+      run_osd $dir $i || return 1
+    done
+
+    # 2. Create Pool and force size 1
+    create_pool $poolname 8 8
+    ceph osd pool set $poolname size 1 --yes-i-really-mean-it
+    wait_for_clean || return 1
+
+    # Inject data
+    for i in $(seq 1 50); do
+      rados -p $poolname put obj$i /dev/null
+    done
+
+    local pgid=$(get_pg $poolname obj1)
+    local primary=$(get_primary $poolname obj1)
+
+    # 3. Select Non-Primary OSD
+    local replica_osd=""
+    for i in $(seq 0 $(expr $osds - 1)); do
+      if [[ "$i" != "$primary" ]]; then
+          replica_osd=$i
+          break
+      fi
+    done
+    echo "Primary is OSD.$primary, selected OSD.$replica_osd to mark OUT."
+
+    local last_clean_start=$(ceph pg $pgid query | \
+      jq -r '.info.stats.last_clean')
+
+    # 4. Mark non-primary OSD out and set norecover
+    ceph osd set norecover
+    ceph osd out $replica_osd
+
+    # 5. Increase pool size to 4
+    echo "Increasing pool size to 4..."
+    ceph osd pool set $poolname size 4
+
+    # 6. Unset norecover and kick the recovery queue
+    echo "Starting recovery..."
+    ceph osd unset norecover
+    ceph tell osd.$primary debug kick_recovery_wq 0
+
+    sleep 10
+    flush_pg_stats || return 1
+
+    # 7. Custom recovery-wait logic
+    echo "Waiting for $pgid to be marked undersized..."
+    for i in $(seq 1 300); do
+      # Fetch only the stats for the specific PG in JSON format
+      local current_state=$(ceph pg $pgid query | jq -r '.info.stats.state')
+      echo "Iteration $i: PG $pgid state is [$current_state]"
+
+      # Check if 'recovering' is absent from the state string
+      if [[ "$current_state" != *"recovering"* ]]; then
+        echo "PG $pgid is marked undersized (current state: $current_state)."
+        break
+      fi
+      if [ "$i" = "300" ]; then
+        echo "Timeout waiting for $pgid to become undersized"
+        ceph pg $pgid query | jq .
+        return 1
+      fi
+      sleep 1
+    done
+
+    # 8. Verification
+    local last_degraded_final=$(ceph pg $pgid query | \
+      jq -r '.info.stats.last_degraded')
+    echo "Initial Clean:  $last_clean_start"
+    echo "Final Degraded: $last_degraded_final"
+
+    if [[ "$last_degraded_final" > "$last_clean_start" ]]; then
+      echo "Test Passed: last_degraded updated correctly."
+    else
+      echo "Test Failed: last_degraded ($last_degraded_final) was not updated."
+      return 1
+    fi
+
+    # Cleanup
+    delete_pool $poolname
+    kill_daemons $dir || return 1
+}
+
  main osd-recovery-stats "$@"
  
  # Local Variables:
diff --git a/src/osd/PeeringState.cc b/src/osd/PeeringState.cc

index a6d12d1e01f3d492234b4c0676812eb6ba71966d..e0effaaa85d450720340baf01abfbe8862768d45 100644 (file)
--- a/src/osd/PeeringState.cc
+++ b/src/osd/PeeringState.cc
@@ -4498,6 +4498,17 @@ std::optional<pg_stat_t> PeeringState::prepare_stats_for_publish(
      if ((info.stats.state & PG_STATE_UNDERSIZED) == 0)
        info.stats.last_fullsized = now;
  
+    // check if the PG is vulnerable
+    if (info.stats.state & (PG_STATE_DEGRADED|PG_STATE_UNDERSIZED)) {
+      // set last_degraded only if we are entering a new
+      // failure state and if it's older than last_clean
+      if (info.stats.last_degraded <= info.stats.last_clean) {
+        info.stats.last_degraded = now;
+      }
+    }
+    // update pre_publish so the change is sent immediately
+    pre_publish.last_degraded = info.stats.last_degraded;
+
      psdout(15) << "publish_stats_to_osd " << pre_publish.reported_epoch
                << ":" << pre_publish.reported_seq << dendl;
      return std::make_optional(std::move(pre_publish));
diff --git a/src/osd/osd_types.cc b/src/osd/osd_types.cc

index 873163fd9a227efdd65bd153679c6aa30ea5ff97..d9fe5491b8e236605ef5893b09cb2874529722d6 100644 (file)
--- a/src/osd/osd_types.cc
+++ b/src/osd/osd_types.cc
@@ -2947,6 +2947,7 @@ void pg_stat_t::dump(Formatter *f) const
    f->dump_stream("last_active") << last_active;
    f->dump_stream("last_peered") << last_peered;
    f->dump_stream("last_clean") << last_clean;
+  f->dump_stream("last_degraded") << last_degraded;
    f->dump_stream("last_became_active") << last_became_active;
    f->dump_stream("last_became_peered") << last_became_peered;
    f->dump_stream("last_unstale") << last_unstale;
@@ -3094,7 +3095,7 @@ bool operator==(const pg_scrubbing_status_t& l, const pg_scrubbing_status_t& r)
  
  void pg_stat_t::encode(ceph::buffer::list &bl) const
  {
-  ENCODE_START(30, 22, bl);
+  ENCODE_START(31, 22, bl);
    encode(version, bl);
    encode(reported_seq, bl);
    encode(reported_epoch, bl);
@@ -3157,6 +3158,7 @@ void pg_stat_t::encode(ceph::buffer::list &bl) const
    encode(scrub_sched_status.m_osd_to_respond, bl);
    encode(scrub_sched_status.m_ordinal_of_requested_replica, bl);
    encode(scrub_sched_status.m_num_to_reserve, bl);
+  encode(last_degraded, bl);
  
    ENCODE_FINISH(bl);
  }
@@ -3165,7 +3167,7 @@ void pg_stat_t::decode(ceph::buffer::list::const_iterator &bl)
  {
    bool tmp;
    uint32_t old_state;
-  DECODE_START(30, bl);
+  DECODE_START(31, bl);
    decode(version, bl);
    decode(reported_seq, bl);
    decode(reported_epoch, bl);
@@ -3267,6 +3269,11 @@ void pg_stat_t::decode(ceph::buffer::list::const_iterator &bl)
      } else {
        scrub_sched_status.m_num_to_reserve = 0;
      }
+    if (struct_v >= 31) {
+      decode(last_degraded, bl);
+    } else {
+      last_degraded = last_clean;
+    }
    }
    DECODE_FINISH(bl);
  }
@@ -3290,6 +3297,7 @@ list<pg_stat_t> pg_stat_t::generate_test_instances()
    a.last_unstale = utime_t(1002, 5);
    a.last_undegraded = utime_t(1002, 7);
    a.last_fullsized = utime_t(1002, 8);
+  a.last_degraded = utime_t(1002, 9);
    a.log_start = eversion_t(1, 4);
    a.ondisk_log_start = eversion_t(1, 5);
    a.created = 6;
@@ -3328,6 +3336,7 @@ list<pg_stat_t> pg_stat_t::generate_test_instances()
    a.acting_primary = 124;
    a.blocked_by.push_back(155);
    a.blocked_by.push_back(156);
+  a.last_degraded = utime_t(1005, 1);
    o.push_back(pg_stat_t(a));
  
    return o;
@@ -3387,7 +3396,8 @@ bool operator==(const pg_stat_t& l, const pg_stat_t& r)
      l.objects_scrubbed == r.objects_scrubbed &&
      l.scrub_duration == r.scrub_duration &&
      l.objects_trimmed == r.objects_trimmed &&
-    l.snaptrim_duration == r.snaptrim_duration;
+    l.snaptrim_duration == r.snaptrim_duration &&
+    l.last_degraded == r.last_degraded;
  }
  
  // -- store_statfs_t --
diff --git a/src/osd/osd_types.h b/src/osd/osd_types.h

index 846ffc6edf481967ce11253bc60fa4be15a34e22..74226b60a2c967d61a75a82fcc8d8b84fc24d124 100644 (file)
--- a/src/osd/osd_types.h
+++ b/src/osd/osd_types.h
@@ -2333,6 +2333,7 @@ struct pg_stat_t {
    utime_t last_active;  // state & PG_STATE_ACTIVE
    utime_t last_peered;  // state & PG_STATE_ACTIVE || state & PG_STATE_PEERED
    utime_t last_clean;   // state & PG_STATE_CLEAN
+  utime_t last_degraded; // state & (PG_STATE_DEGRADED | PG_STATE_UNDERSIZED)
    utime_t last_unstale; // (state & PG_STATE_STALE) == 0
    utime_t last_undegraded; // (state & PG_STATE_DEGRADED) == 0
    utime_t last_fullsized; // (state & PG_STATE_UNDERSIZED) == 0
author	Sridhar Seshasayee <sridhar.seshasayee@ibm.com>
	Wed, 6 May 2026 15:11:33 +0000 (20:41 +0530)
committer	Sridhar Seshasayee <sridhar.seshasayee@ibm.com>
	Thu, 4 Jun 2026 14:14:56 +0000 (19:44 +0530)
PendingReleaseNotes		patch \| blob \| history
qa/standalone/osd/osd-recovery-stats.sh		patch \| blob \| history
src/osd/PeeringState.cc		patch \| blob \| history
src/osd/osd_types.cc		patch \| blob \| history
src/osd/osd_types.h		patch \| blob \| history