From 0dc6297d09b15601ded4398fef8cf766d2c74f95 Mon Sep 17 00:00:00 2001 From: David Zafman Date: Tue, 9 May 2017 14:35:58 -0700 Subject: [PATCH] osd: Cancel recovering when no more progress can be made Add new CancelRecovery transition (Recovering -> NotRecovering) When giving up on recovery due to errors use new transition which includes scheduling retry of recovery. Signed-off-by: David Zafman --- src/osd/OSD.cc | 17 ++++++++++++++--- src/osd/PG.cc | 15 +++++++++++++-- src/osd/PG.h | 5 ++++- 3 files changed, 31 insertions(+), 6 deletions(-) diff --git a/src/osd/OSD.cc b/src/osd/OSD.cc index bfee81253888..79c87fa47387 100644 --- a/src/osd/OSD.cc +++ b/src/osd/OSD.cc @@ -9124,16 +9124,27 @@ void OSD::do_recovery( if (!more && pg->have_unfound()) { pg->discover_all_missing(*rctx.query_map); if (rctx.query_map->empty()) { - dout(10) << __func__ << ": no luck, giving up on this pg for now" << dendl; + string action; if (pg->state_test(PG_STATE_BACKFILL)) { auto evt = PG::CephPeeringEvtRef(new PG::CephPeeringEvt( queued, queued, PG::CancelBackfill())); pg->queue_peering_event(evt); - } + action = "in backfill"; + } else if (pg->state_test(PG_STATE_RECOVERING)) { + auto evt = PG::CephPeeringEvtRef(new PG::CephPeeringEvt( + queued, + queued, + PG::CancelRecovery())); + pg->queue_peering_event(evt); + action = "in recovery"; + } else { + action = "already out of recovery/backfill"; + } + dout(10) << __func__ << ": no luck, giving up on this pg for now (" << action << ")" << dendl; } else { - dout(10) << __func__ << ": no luck, giving up on this pg for now" << dendl; + dout(10) << __func__ << ": no luck, giving up on this pg for now (queue_recovery)" << dendl; pg->queue_recovery(); } } diff --git a/src/osd/PG.cc b/src/osd/PG.cc index 0ea8abadb535..5fb3833bd118 100644 --- a/src/osd/PG.cc +++ b/src/osd/PG.cc @@ -6774,10 +6774,10 @@ PG::RecoveryState::Recovering::Recovering(my_context ctx) pg->queue_recovery(); } -void PG::RecoveryState::Recovering::release_reservations() 
+void PG::RecoveryState::Recovering::release_reservations(bool cancel) { PG *pg = context< RecoveryMachine >().pg; - assert(!pg->pg_log.get_missing().have_missing()); + assert(cancel || !pg->pg_log.get_missing().have_missing()); // release remote reservations for (set::const_iterator i = @@ -6817,6 +6817,17 @@ PG::RecoveryState::Recovering::react(const RequestBackfill &evt) return transit(); } +boost::statechart::result +PG::RecoveryState::Recovering::react(const CancelRecovery &evt) +{ + PG *pg = context< RecoveryMachine >().pg; + pg->state_clear(PG_STATE_RECOVERING); + pg->osd->local_reserver.cancel_reservation(pg->info.pgid); + release_reservations(true); + pg->schedule_recovery_full_retry(); + return transit<NotRecovering>(); +} + void PG::RecoveryState::Recovering::exit() { context< RecoveryMachine >().log_exit(state_name, enter_time); diff --git a/src/osd/PG.h b/src/osd/PG.h index 0923c0570ce0..ef39da8edc0b 100644 --- a/src/osd/PG.h +++ b/src/osd/PG.h @@ -1566,6 +1566,7 @@ public: TrivialEvent(RecoveryDone) TrivialEvent(BackfillTooFull) TrivialEvent(RecoveryTooFull) + TrivialEvent(CancelRecovery) TrivialEvent(AllReplicasRecovered) TrivialEvent(DoRecovery) @@ -1988,12 +1989,14 @@ public: struct Recovering : boost::statechart::state< Recovering, Active >, NamedState { typedef boost::mpl::list < boost::statechart::custom_reaction< AllReplicasRecovered >, + boost::statechart::custom_reaction< CancelRecovery >, boost::statechart::custom_reaction< RequestBackfill > > reactions; explicit Recovering(my_context ctx); void exit(); - void release_reservations(); + void release_reservations(bool cancel = false); boost::statechart::result react(const AllReplicasRecovered &evt); + boost::statechart::result react(const CancelRecovery& evt); boost::statechart::result react(const RequestBackfill &evt); }; -- 2.47.3