git.apps.os.sepia.ceph.com Git - ceph.git/commitdiff
osd/PeeringState: re-evaluate full OSDs while waiting for recovery reservation 63296/head
author Nitzan Mordechai <nmordech@redhat.com>
Tue, 13 May 2025 10:00:45 +0000 (10:00 +0000)
committer Nitzan Mordechai <nmordech@redhat.com>
Sun, 18 May 2025 05:20:52 +0000 (05:20 +0000)
A PG can enter *Wait{Local,Remote}RecoveryReserved* before the OSD-map
marks one of its acting OSDs as full, causing us to miss the
recovery_toofull state and stall recovery until a later admin action.

Add react(AdvMap) handlers to both wait states: on every new map,
re-run check_full() and post RecoveryTooFull if needed; otherwise
forward the event unchanged.

Fixes: https://tracker.ceph.com/issues/70670
Signed-off-by: Nitzan Mordechai <nmordech@redhat.com>
src/osd/PeeringState.cc
src/osd/PeeringState.h

index de2d275ed87a0d79d89b4ed1a03650226e7ba0a7..8caaf1e037cbd0b82b7dfef8370b02fe7dcdc194 100644 (file)
@@ -6004,6 +6004,18 @@ PeeringState::WaitLocalRecoveryReserved::react(const RecoveryTooFull &evt)
   return transit<NotRecovering>();
 }
 
+boost::statechart::result  // AdvMap reaction: re-run the full check on each new OSD map
+PeeringState::WaitLocalRecoveryReserved::react(const AdvMap& ev)  // ev is not read; the map comes from ps->get_osdmap()
+{
+  DECLARE_LOCALS;  // presumably brings `ps` into scope — verify against the macro definition
+  if (!ps->cct->_conf->osd_debug_skip_full_check_in_recovery &&  // this config flag bypasses the full check
+      ps->get_osdmap()->check_full(ps->acting_recovery_backfill)) {
+    post_event(RecoveryTooFull());  // this state's react(const RecoveryTooFull&) transits to NotRecovering
+    return discard_event();  // NOTE(review): AdvMap is not forwarded on this path — confirm outer states may miss it
+  }
+  return forward_event();  // not full: hand the AdvMap on unchanged
+}
+
 void PeeringState::WaitLocalRecoveryReserved::exit()
 {
   context< PeeringMachine >().log_exit(state_name, enter_time);
@@ -6044,6 +6056,18 @@ PeeringState::WaitRemoteRecoveryReserved::react(const RemoteRecoveryReserved &ev
   return discard_event();
 }
 
+boost::statechart::result  // AdvMap reaction: re-run the full check on each new OSD map
+PeeringState::WaitRemoteRecoveryReserved::react(const AdvMap& ev)  // ev is not read; the map comes from ps->get_osdmap()
+{
+  DECLARE_LOCALS;  // presumably brings `ps` into scope — verify against the macro definition
+  if (!ps->cct->_conf->osd_debug_skip_full_check_in_recovery &&  // this config flag bypasses the full check
+      ps->get_osdmap()->check_full(ps->acting_recovery_backfill)) {
+    post_event(RecoveryTooFull());  // NOTE(review): no RecoveryTooFull reaction for this state is visible here — confirm who handles it
+    return discard_event();  // NOTE(review): AdvMap is not forwarded on this path — confirm outer states may miss it
+  }
+  return forward_event();  // not full: hand the AdvMap on unchanged
+}
+
 void PeeringState::WaitRemoteRecoveryReserved::exit()
 {
   context< PeeringMachine >().log_exit(state_name, enter_time);
index 82c0082bb6e668921dce6127ba6bd19e827e6df0..2f1dda828c4e516d33014a2dcd328d2c5a506267 100644 (file)
@@ -1197,6 +1197,7 @@ public:
     std::set<pg_shard_t>::const_iterator remote_recovery_reservation_it;
     explicit WaitRemoteRecoveryReserved(my_context ctx);
     boost::statechart::result react(const RemoteRecoveryReserved &evt);
+    boost::statechart::result react(const AdvMap& ev);
     void exit();
   };
 
@@ -1208,6 +1209,7 @@ public:
     explicit WaitLocalRecoveryReserved(my_context ctx);
     void exit();
     boost::statechart::result react(const RecoveryTooFull &evt);
+    boost::statechart::result react(const AdvMap& ev);
   };
 
   struct Activating : boost::statechart::state< Activating, Active >, NamedState {