From: Nitzan Mordechai Date: Tue, 13 May 2025 10:00:45 +0000 (+0000) Subject: osd/PeeringState: re-evaluate full OSDs while waiting for recovery reservation X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=b8d2c6832fbd1e630cf820079bc9639c5411772d;p=ceph.git osd/PeeringState: re-evaluate full OSDs while waiting for recovery reservation A PG can enter *Wait{Local,Remote}RecoveryReserved* before the OSD-map marks one of its acting OSDs as full, causing us to miss the recovery_toofull state and stall recovery until a later admin action. Add react(AdvMap) handlers to both wait states: on every new map, re-run check_full() and post RecoveryTooFull if needed; otherwise forward the event unchanged. Fixes: https://tracker.ceph.com/issues/70670 Signed-off-by: Nitzan Mordechai --- diff --git a/src/osd/PeeringState.cc b/src/osd/PeeringState.cc index de2d275ed87a..8caaf1e037cb 100644 --- a/src/osd/PeeringState.cc +++ b/src/osd/PeeringState.cc @@ -6004,6 +6004,18 @@ PeeringState::WaitLocalRecoveryReserved::react(const RecoveryTooFull &evt) return transit<NotRecovering>(); } +boost::statechart::result +PeeringState::WaitLocalRecoveryReserved::react(const AdvMap& ev) +{ + DECLARE_LOCALS; + if (!ps->cct->_conf->osd_debug_skip_full_check_in_recovery && + ps->get_osdmap()->check_full(ps->acting_recovery_backfill)) { + post_event(RecoveryTooFull()); + return discard_event(); + } + return forward_event(); +} + void PeeringState::WaitLocalRecoveryReserved::exit() { context< PeeringMachine >().log_exit(state_name, enter_time); @@ -6044,6 +6056,18 @@ PeeringState::WaitRemoteRecoveryReserved::react(const RemoteRecoveryReserved &ev return discard_event(); } +boost::statechart::result +PeeringState::WaitRemoteRecoveryReserved::react(const AdvMap& ev) +{ + DECLARE_LOCALS; + if (!ps->cct->_conf->osd_debug_skip_full_check_in_recovery && + ps->get_osdmap()->check_full(ps->acting_recovery_backfill)) { + post_event(RecoveryTooFull()); + return discard_event(); + } + return 
forward_event(); +} + void PeeringState::WaitRemoteRecoveryReserved::exit() { context< PeeringMachine >().log_exit(state_name, enter_time); diff --git a/src/osd/PeeringState.h b/src/osd/PeeringState.h index 82c0082bb6e6..2f1dda828c4e 100644 --- a/src/osd/PeeringState.h +++ b/src/osd/PeeringState.h @@ -1197,6 +1197,7 @@ public: std::set<pg_shard_t>::const_iterator remote_recovery_reservation_it; explicit WaitRemoteRecoveryReserved(my_context ctx); boost::statechart::result react(const RemoteRecoveryReserved &evt); + boost::statechart::result react(const AdvMap& ev); void exit(); }; @@ -1208,6 +1209,7 @@ public: explicit WaitLocalRecoveryReserved(my_context ctx); void exit(); boost::statechart::result react(const RecoveryTooFull &evt); + boost::statechart::result react(const AdvMap& ev); }; struct Activating : boost::statechart::state< Activating, Active >, NamedState {