From: Sridhar Seshasayee Date: Wed, 6 May 2026 15:11:33 +0000 (+0530) Subject: osd: add last_degraded field to pg_stat_t X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=252d14923858b6695dad4a7d70f70ed3881abd28;p=ceph.git osd: add last_degraded field to pg_stat_t Introduce a 'last_degraded' timestamp to the pg_stat_t structure to track the initial point of redundancy loss. This field, used in conjunction with 'last_clean', allows the manager to calculate a cluster-wide durability score by measuring the duration of vulnerability windows. Changes: 1) Add last_degraded (utime_t) to pg_stat_t in osd_types.h. 2) Increment pg_stat_t encoding version to 31. The decode logic defaults last_degraded to last_clean for backward compatibility during rolling upgrades. 3) Update operator==, dump(), and generate_test_instances() to support ceph-dencoder testing and JSON output. 4) Implement latching logic in PeeringState::prepare_stats_for_publish(): - A PG is considered vulnerable if in DEGRADED or UNDERSIZED state. - last_degraded is set to 'now' only if it is <= last_clean, effectively latching the timestamp to the start of the failure event until the PG next becomes clean. 5) Standalone tests to verify: - The last_degraded timestamp latching logic. - Verify last_degraded timestamp is modified when OSDs are marked 'out' for draining purposes in which case PGs are marked undersized. 6) Release note the addition of 'last_degraded' field to PG stats. Fixes: https://tracker.ceph.com/issues/76604 Signed-off-by: Sridhar Seshasayee --- diff --git a/PendingReleaseNotes b/PendingReleaseNotes index d94e35fa6b5..e330cc903b9 100644 --- a/PendingReleaseNotes +++ b/PendingReleaseNotes @@ -115,6 +115,12 @@ invalidates entries when underlying cluster maps are updated. The cache is enabled by default. The previous `mgr_ttl_cache_expire_seconds` configuration option has been removed and replaced with `mgr_map_cache_enabled` (default: true). +* The ``last_degraded`` timestamp is added to the ``pg_stat_t`` structure to + track the initial point of redundancy loss when a PG enters an undersized or + degraded state. This timestamp is latched until the PG returns to a clean + state and a subsequent redundancy loss occurs. Used in conjunction with + ``last_clean``, the ``last_degraded`` timestamp enables the calculation of + data vulnerability and durability scores. >=20.0.0 diff --git a/qa/standalone/osd/osd-recovery-stats.sh b/qa/standalone/osd/osd-recovery-stats.sh index fd8087646dd..6a402d4914f 100755 --- a/qa/standalone/osd/osd-recovery-stats.sh +++ b/qa/standalone/osd/osd-recovery-stats.sh @@ -505,6 +505,217 @@ function TEST_recovery_multi() { kill_daemons $dir || return 1 } +function TEST_recovery_last_degraded_latching() { + local dir=$1 + local osds=6 + + # Setup Cluster + run_mon $dir a || return 1 + run_mgr $dir x || return 1 + for i in $(seq 0 $(expr $osds - 1)); do + run_osd $dir $i || return 1 + done + + # Create Pool with specific replica counts + create_pool $poolname 8 8 + ceph osd pool set $poolname size 3 + ceph osd pool set $poolname min_size 1 + wait_for_clean || return 1 + + # Inject data + local numobjs=100 + for i in $(seq 1 $numobjs); do + rados -p $poolname put obj$i /dev/null + done + + # Identify PG and OSDs + local pgid=$(get_pg $poolname obj1) + local replicaosds=$(get_osds $poolname obj1 | awk '{print $2, $3}') + read -r osd_a osd_b <<< "$replicaosds" + + # Capture baseline timestamp + local last_clean_start=$(ceph pg $pgid query | \ + jq -r '.info.stats.last_clean') + + # --- Step 1: Kill the first non-primary OSD (osd_a) --- + echo "Setting norecover to freeze PG state..." + ceph osd set norecover + + echo "Stopping OSD.$osd_a..." + kill $(cat $dir/osd.${osd_a}.pid) + ceph osd down osd.${osd_a} + ceph osd out osd.${osd_a} + + # 1.1 Wait and confirm state moves to degraded or undersized + local state="" + for i in $(seq 1 30); do + state=$(ceph pg $pgid query | jq -r '.info.stats.state') + echo "Current PG $pgid state: $state" + if [[ "$state" == *"degraded"* ]] || \ + [[ "$state" == *"undersized"* ]]; then + break + fi + sleep 1 + done + + if [[ "$state" != *"degraded"* ]] && [[ "$state" != *"undersized"* ]]; then + echo "Error: PG $pgid state ($state) did not become " \ + "degraded/undersized after killing osd.$osd_a." + return 1 + fi + + # 1.2 Confirm last_degraded updated + local last_degraded_t1=$(ceph pg $pgid query | \ + jq -r '.info.stats.last_degraded') + echo "Queried last_degraded (T1): $last_degraded_t1" + if [[ "$last_degraded_t1" > "$last_clean_start" ]]; then + echo "Confirmed: last_degraded ($last_degraded_t1) updated on failure." + else + echo "Error: last_degraded ($last_degraded_t1) is not newer than " \ + "initial last_clean ($last_clean_start)." + return 1 + fi + + # --- Step 2: Kill the second non-primary OSD (osd_b) --- + echo "Stopping OSD.$osd_b..." + kill $(cat $dir/osd.${osd_b}.pid) + ceph osd down osd.${osd_b} + ceph osd out osd.${osd_b} + + # 2.1 Confirm last_degraded remains latched (the same) + local last_degraded_t2=$(ceph pg $pgid query | \ + jq -r '.info.stats.last_degraded') + echo "Queried last_degraded (T2): $last_degraded_t2" + if [[ "$last_degraded_t2" == "$last_degraded_t1" ]]; then + echo "Test Passed: last_degraded timestamp remained " \ + "stable at $last_degraded_t2." + else + echo "Test Failed: last_degraded updated to " \ + "$last_degraded_t2 on second failure." + return 1 + fi + + # --- Step 3: Recovery --- + echo "Unsetting norecover and restarting OSDs..." + ceph osd unset norecover + + echo "Restarting OSDs $osd_a and $osd_b..." + activate_osd $dir $osd_a + activate_osd $dir $osd_b + wait_for_clean || return 1 + + # --- Step 4: Final Verification --- + local final_stats=$(ceph pg $pgid query | \ + jq -r '.info.stats | "\(.last_degraded) \(.last_clean)"') + read -r last_degraded_final last_clean_final <<< "$final_stats" + + echo "Final Timestamps -> Last Degraded: $last_degraded_final, " \ + "Last Clean: $last_clean_final" + if [[ "$last_clean_final" > "$last_degraded_final" ]]; then + echo "Test Passed: Recovery successful. last_clean ($last_clean_final) " \ + "is newer than last_degraded ($last_degraded_final)." + else + echo "Test Failed: last_clean ($last_clean_final) was not updated " \ + "correctly after recovery." + return 1 + fi + + # Cleanup + delete_pool $poolname + kill_daemons $dir || return 1 +} + +function TEST_recovery_last_degraded_undersized() { + local dir=$1 + local osds=3 + + # 1. Setup Cluster + run_mon $dir a || return 1 + run_mgr $dir x || return 1 + for i in $(seq 0 $(expr $osds - 1)); do + run_osd $dir $i || return 1 + done + + # 2. Create Pool and force size 1 + create_pool $poolname 8 8 + ceph osd pool set $poolname size 1 --yes-i-really-mean-it + wait_for_clean || return 1 + + # Inject data + for i in $(seq 1 50); do + rados -p $poolname put obj$i /dev/null + done + + local pgid=$(get_pg $poolname obj1) + local primary=$(get_primary $poolname obj1) + + # 3. Select Non-Primary OSD + local replica_osd="" + for i in $(seq 0 $(expr $osds - 1)); do + if [[ "$i" != "$primary" ]]; then + replica_osd=$i + break + fi + done + echo "Primary is OSD.$primary, selected OSD.$replica_osd to mark OUT." + + local last_clean_start=$(ceph pg $pgid query | \ + jq -r '.info.stats.last_clean') + + # 4. Mark non-primary OSD out and set norecover + ceph osd set norecover + ceph osd out $replica_osd + + # 5. Increase pool size to 4 + echo "Increasing pool size to 4..." + ceph osd pool set $poolname size 4 + + # 6. Unset norecover and kick the recovery queue + echo "Starting recovery..." + ceph osd unset norecover + ceph tell osd.$primary debug kick_recovery_wq 0 + + sleep 10 + flush_pg_stats || return 1 + + # 7. Custom recovery-wait logic + echo "Waiting for $pgid to be marked undersized..." + for i in $(seq 1 300); do + # Fetch only the stats for the specific PG in JSON format + local current_state=$(ceph pg $pgid query | jq -r '.info.stats.state') + echo "Iteration $i: PG $pgid state is [$current_state]" + + # Check if 'recovering' is absent from the state string + if [[ "$current_state" != *"recovering"* ]]; then + echo "PG $pgid is marked undersized (current state: $current_state)." + break + fi + if [ "$i" = "300" ]; then + echo "Timeout waiting for $pgid to become undersized" + ceph pg $pgid query | jq . + return 1 + fi + sleep 1 + done + + # 8. Verification + local last_degraded_final=$(ceph pg $pgid query | \ + jq -r '.info.stats.last_degraded') + echo "Initial Clean: $last_clean_start" + echo "Final Degraded: $last_degraded_final" + + if [[ "$last_degraded_final" > "$last_clean_start" ]]; then + echo "Test Passed: last_degraded updated correctly." + else + echo "Test Failed: last_degraded ($last_degraded_final) was not updated." + return 1 + fi + + # Cleanup + delete_pool $poolname + kill_daemons $dir || return 1 +} + main osd-recovery-stats "$@" # Local Variables: diff --git a/src/osd/PeeringState.cc b/src/osd/PeeringState.cc index a6d12d1e01f..e0effaaa85d 100644 --- a/src/osd/PeeringState.cc +++ b/src/osd/PeeringState.cc @@ -4498,6 +4498,17 @@ std::optional PeeringState::prepare_stats_for_publish( if ((info.stats.state & PG_STATE_UNDERSIZED) == 0) info.stats.last_fullsized = now; + // check if the PG is vulnerable + if (info.stats.state & (PG_STATE_DEGRADED|PG_STATE_UNDERSIZED)) { + // set last_degraded only if we are entering a new + // failure state and if it's older than last_clean + if (info.stats.last_degraded <= info.stats.last_clean) { + info.stats.last_degraded = now; + } + } + // update pre_publish so the change is sent immediately + pre_publish.last_degraded = info.stats.last_degraded; + psdout(15) << "publish_stats_to_osd " << pre_publish.reported_epoch << ":" << pre_publish.reported_seq << dendl; return std::make_optional(std::move(pre_publish)); diff --git a/src/osd/osd_types.cc b/src/osd/osd_types.cc index 873163fd9a2..d9fe5491b8e 100644 --- a/src/osd/osd_types.cc +++ b/src/osd/osd_types.cc @@ -2947,6 +2947,7 @@ void pg_stat_t::dump(Formatter *f) const f->dump_stream("last_active") << last_active; f->dump_stream("last_peered") << last_peered; f->dump_stream("last_clean") << last_clean; + f->dump_stream("last_degraded") << last_degraded; f->dump_stream("last_became_active") << last_became_active; f->dump_stream("last_became_peered") << last_became_peered; f->dump_stream("last_unstale") << last_unstale; @@ -3094,7 +3095,7 @@ bool operator==(const pg_scrubbing_status_t& l, const pg_scrubbing_status_t& r) void pg_stat_t::encode(ceph::buffer::list &bl) const { - ENCODE_START(30, 22, bl); + ENCODE_START(31, 22, bl); encode(version, bl); encode(reported_seq, bl); encode(reported_epoch, bl); @@ -3157,6 +3158,7 @@ void pg_stat_t::encode(ceph::buffer::list &bl) const encode(scrub_sched_status.m_osd_to_respond, bl); encode(scrub_sched_status.m_ordinal_of_requested_replica, bl); encode(scrub_sched_status.m_num_to_reserve, bl); + encode(last_degraded, bl); ENCODE_FINISH(bl); } @@ -3165,7 +3167,7 @@ void pg_stat_t::decode(ceph::buffer::list::const_iterator &bl) { bool tmp; uint32_t old_state; - DECODE_START(30, bl); + DECODE_START(31, bl); decode(version, bl); decode(reported_seq, bl); decode(reported_epoch, bl); @@ -3267,6 +3269,11 @@ void pg_stat_t::decode(ceph::buffer::list::const_iterator &bl) } else { scrub_sched_status.m_num_to_reserve = 0; } + if (struct_v >= 31) { + decode(last_degraded, bl); + } else { + last_degraded = last_clean; + } } DECODE_FINISH(bl); } @@ -3290,6 +3297,7 @@ list pg_stat_t::generate_test_instances() a.last_unstale = utime_t(1002, 5); a.last_undegraded = utime_t(1002, 7); a.last_fullsized = utime_t(1002, 8); + a.last_degraded = utime_t(1002, 9); a.log_start = eversion_t(1, 4); a.ondisk_log_start = eversion_t(1, 5); a.created = 6; @@ -3328,6 +3336,7 @@ list pg_stat_t::generate_test_instances() a.acting_primary = 124; a.blocked_by.push_back(155); a.blocked_by.push_back(156); + a.last_degraded = utime_t(1005, 1); o.push_back(pg_stat_t(a)); return o; @@ -3387,7 +3396,8 @@ bool operator==(const pg_stat_t& l, const pg_stat_t& r) l.objects_scrubbed == r.objects_scrubbed && l.scrub_duration == r.scrub_duration && l.objects_trimmed == r.objects_trimmed && - l.snaptrim_duration == r.snaptrim_duration; + l.snaptrim_duration == r.snaptrim_duration && + l.last_degraded == r.last_degraded; } // -- store_statfs_t -- diff --git a/src/osd/osd_types.h b/src/osd/osd_types.h index 846ffc6edf4..74226b60a2c 100644 --- a/src/osd/osd_types.h +++ b/src/osd/osd_types.h @@ -2333,6 +2333,7 @@ struct pg_stat_t { utime_t last_active; // state & PG_STATE_ACTIVE utime_t last_peered; // state & PG_STATE_ACTIVE || state & PG_STATE_PEERED utime_t last_clean; // state & PG_STATE_CLEAN + utime_t last_degraded; // state & (PG_STATE_DEGRADED | PG_STATE_UNDERSIZED) utime_t last_unstale; // (state & PG_STATE_STALE) == 0 utime_t last_undegraded; // (state & PG_STATE_DEGRADED) == 0 utime_t last_fullsized; // (state & PG_STATE_UNDERSIZED) == 0