From: David Zafman Date: Mon, 8 May 2017 18:29:55 +0000 (-0700) Subject: osd: Cancel backfill when can't proceed due to errors X-Git-Tag: ses5-milestone8~1^2~19^2~9 X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=e708410542b0a52fbb29e14b76f49c94adbc0a59;p=ceph.git osd: Cancel backfill when can't proceed due to errors Add new transition CancelBackfill (Backfilling -> NotBackfilling) When giving up on backfill due to errors use new transition which includes scheduling retry of backfill. Signed-off-by: David Zafman --- diff --git a/src/osd/OSD.cc b/src/osd/OSD.cc index fb77b0777bca..bfee81253888 100644 --- a/src/osd/OSD.cc +++ b/src/osd/OSD.cc @@ -9124,9 +9124,16 @@ void OSD::do_recovery( if (!more && pg->have_unfound()) { pg->discover_all_missing(*rctx.query_map); if (rctx.query_map->empty()) { - dout(10) << "do_recovery no luck, giving up on this pg for now" << dendl; + dout(10) << __func__ << ": no luck, giving up on this pg for now" << dendl; + if (pg->state_test(PG_STATE_BACKFILL)) { + auto evt = PG::CephPeeringEvtRef(new PG::CephPeeringEvt( + queued, + queued, + PG::CancelBackfill())); + pg->queue_peering_event(evt); + } } else { - dout(10) << "do_recovery no luck, giving up on this pg for now" << dendl; + dout(10) << __func__ << ": no luck, giving up on this pg for now" << dendl; pg->queue_recovery(); } } diff --git a/src/osd/PG.cc b/src/osd/PG.cc index 39b373fd478a..0ea8abadb535 100644 --- a/src/osd/PG.cc +++ b/src/osd/PG.cc @@ -6273,6 +6273,37 @@ PG::RecoveryState::Backfilling::Backfilling(my_context ctx) pg->publish_stats_to_osd(); } +boost::statechart::result +PG::RecoveryState::Backfilling::react(const CancelBackfill &) +{ + PG *pg = context< RecoveryMachine >().pg; + pg->osd->local_reserver.cancel_reservation(pg->info.pgid); + // XXX: Add a new pg state so user can see why backfill isn't proceeding + // Can't use PG_STATE_BACKFILL_WAIT since it means waiting for reservations + 
//pg->state_set(PG_STATE_BACKFILL_STALLED????); + + for (set<pg_shard_t>::iterator it = pg->backfill_targets.begin(); + it != pg->backfill_targets.end(); + ++it) { + assert(*it != pg->pg_whoami); + ConnectionRef con = pg->osd->get_con_osd_cluster( + it->osd, pg->get_osdmap()->get_epoch()); + if (con) { + pg->osd->send_message_osd_cluster( + new MBackfillReserve( + MBackfillReserve::REJECT, + spg_t(pg->info.pgid.pgid, it->shard), + pg->get_osdmap()->get_epoch()), + con.get()); + } + } + + pg->waiting_on_backfill.clear(); + + pg->schedule_backfill_full_retry(); + return transit<NotBackfilling>(); +} + boost::statechart::result PG::RecoveryState::Backfilling::react(const RemoteReservationRejected &) { diff --git a/src/osd/PG.h b/src/osd/PG.h index 8b3fef6d3965..0923c0570ce0 100644 --- a/src/osd/PG.h +++ b/src/osd/PG.h @@ -1560,6 +1560,7 @@ public: TrivialEvent(LocalBackfillReserved) TrivialEvent(RemoteBackfillReserved) TrivialEvent(RemoteReservationRejected) + TrivialEvent(CancelBackfill) TrivialEvent(RequestBackfill) TrivialEvent(RequestRecovery) TrivialEvent(RecoveryDone) @@ -1871,10 +1872,12 @@ public: struct Backfilling : boost::statechart::state< Backfilling, Active >, NamedState { typedef boost::mpl::list< boost::statechart::transition< Backfilled, Recovered >, + boost::statechart::custom_reaction< CancelBackfill >, boost::statechart::custom_reaction< RemoteReservationRejected > > reactions; explicit Backfilling(my_context ctx); boost::statechart::result react(const RemoteReservationRejected& evt); + boost::statechart::result react(const CancelBackfill& evt); void exit(); };