From 82d5010e04fca6b0ea00bda298cb07f235d885a6 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Fri, 27 Apr 2018 15:00:58 -0500 Subject: [PATCH] osd/PG: fix DeferRecovery vs AllReplicasRecovered race - DeferRecovery event queued by AsyncReserver due to preemption event. We are in Recovering state with RECOVERING bit set. - We finish recovery, clear RECOVERING state bit, and queue AllReplicasRecovered from PrimaryLogPG::start_recovery_ops() - DeferRecovery event arrives, moving us from Recovering -> NotRecovering - AllReplicasRecovered event arrives, crashing us. This is all hard to deal with because the events are queued and may arrive later. Solve the problem here by tolerating a delayed DeferRecovery event: if the RECOVERING pg state bit isn't set, ignore it (it's old). The async reserver cancel events are unpredictable. Fixes: http://tracker.ceph.com/issues/23860 Signed-off-by: Sage Weil (cherry picked from commit cfe59cf20c4b09aa7b25c3f9a724a01380699744) --- src/osd/PG.cc | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/osd/PG.cc b/src/osd/PG.cc index 1558c901a02c6..9b3022e05f9d0 100644 --- a/src/osd/PG.cc +++ b/src/osd/PG.cc @@ -7143,6 +7143,12 @@ boost::statechart::result PG::RecoveryState::Recovering::react(const DeferRecovery &evt) { PG *pg = context< RecoveryMachine >().pg; + if (!pg->state_test(PG_STATE_RECOVERING)) { + // we may have finished recovery and have an AllReplicasRecovered + // event queued to move us to the next state. + ldout(pg->cct, 10) << "got defer recovery but not recovering" << dendl; + return discard_event(); + } ldout(pg->cct, 10) << "defer recovery, retry delay " << evt.delay << dendl; pg->state_clear(PG_STATE_RECOVERING); pg->state_set(PG_STATE_RECOVERY_WAIT); -- 2.39.5