From 4fb3947ef0e711b88b3ba5277d6f50c5d153f675 Mon Sep 17 00:00:00 2001
From: Sage Weil
Date: Tue, 19 Dec 2017 11:48:26 -0600
Subject: [PATCH] osd/PG: use local_reserver to schedule delete

Use the reserver so that delete competes for the same slot(s) as
recovery and such.

Priority below recovery normally, unless the OSD is getting fullish,
in which case we set a very high priority. We have to be careful
here because backfill will back off when the OSD gets full(ish) but
log recovery does not.

Signed-off-by: Sage Weil
---
 src/osd/PG.cc       | 83 ++++++++++++++++++++++++++++++++++++++++++---
 src/osd/PG.h        | 44 ++++++++++++++++++++----
 src/osd/osd_types.h |  9 +++++
 3 files changed, 124 insertions(+), 12 deletions(-)

diff --git a/src/osd/PG.cc b/src/osd/PG.cc
index 9517f77046e..9f5b1b4c4a7 100644
--- a/src/osd/PG.cc
+++ b/src/osd/PG.cc
@@ -2211,6 +2211,19 @@ unsigned PG::get_backfill_priority()
   return static_cast<unsigned>(ret);
 }
 
+unsigned PG::get_delete_priority()
+{
+  auto state = get_osdmap()->get_state(osd->whoami);
+  if (state & (CEPH_OSD_NEARFULL |
+               CEPH_OSD_FULL)) {
+    return OSD_DELETE_PRIORITY_FULL;
+  } else if (state & CEPH_OSD_BACKFILLFULL) {
+    return OSD_DELETE_PRIORITY_FULLISH;
+  } else {
+    return OSD_DELETE_PRIORITY_NORMAL;
+  }
+}
+
 void PG::finish_recovery(list<Context*>& tfin)
 {
   dout(10) << "finish_recovery" << dendl;
@@ -6125,6 +6138,10 @@ void PG::_delete_some()
 
     osd->finish_pg_delete(this);
     deleted = true;
+
+    // cancel reserver here, since the PG is about to get deleted and the
+    // exit() methods don't run when that happens.
+    osd->local_reserver.cancel_reservation(info.pgid);
   }
 }
 
@@ -7964,24 +7981,80 @@ void PG::RecoveryState::Stray::exit()
 }
 
 
-/*--------Deleting----------*/
+/*--------ToDelete----------*/
+PG::RecoveryState::ToDelete::ToDelete(my_context ctx)
+  : my_base(ctx),
+    NamedState(context< RecoveryMachine >().pg, "Started/ToDelete")
+{
+  context< RecoveryMachine >().log_enter(state_name);
+  PG *pg = context< RecoveryMachine >().pg;
+  pg->osd->logger->inc(l_osd_pg_removing);
+}
+
+void PG::RecoveryState::ToDelete::exit()
+{
+  context< RecoveryMachine >().log_exit(state_name, enter_time);
+  PG *pg = context< RecoveryMachine >().pg;
+  pg->osd->logger->dec(l_osd_pg_removing);
+  pg->osd->local_reserver.cancel_reservation(pg->info.pgid);
+}
+
+/*----WaitDeleteReserved----*/
+PG::RecoveryState::WaitDeleteReserved::WaitDeleteReserved(my_context ctx)
+  : my_base(ctx),
+    NamedState(context< RecoveryMachine >().pg,
+               "Started/ToDelete/WaitDeleteReseved")
+{
+  context< RecoveryMachine >().log_enter(state_name);
+  PG *pg = context< RecoveryMachine >().pg;
+  context<ToDelete>().priority = pg->get_delete_priority();
+  pg->osd->local_reserver.cancel_reservation(pg->info.pgid);
+  pg->osd->local_reserver.request_reservation(
+    pg->info.pgid,
+    new QueuePeeringEvt<DeleteReserved>(
+      pg, pg->get_osdmap()->get_epoch(),
+      DeleteReserved()),
+    context<ToDelete>().priority,
+    new QueuePeeringEvt<DeleteInterrupted>(
+      pg, pg->get_osdmap()->get_epoch(),
+      DeleteInterrupted()));
+}
+
+boost::statechart::result PG::RecoveryState::ToDelete::react(
+  const ActMap& evt)
+{
+  PG *pg = context< RecoveryMachine >().pg;
+  if (pg->get_delete_priority() != priority) {
+    ldout(pg->cct,10) << __func__ << " delete priority changed, resetting"
+                      << dendl;
+    return transit<ToDelete>();
+  }
+  return discard_event();
+}
+
+void PG::RecoveryState::WaitDeleteReserved::exit()
+{
+  context< RecoveryMachine >().log_exit(state_name, enter_time);
+}
+
+/*----Deleting-----*/
 PG::RecoveryState::Deleting::Deleting(my_context ctx)
   : my_base(ctx),
-    NamedState(context< RecoveryMachine >().pg, "Started/Deleting")
+    NamedState(context< RecoveryMachine >().pg, "Started/ToDelete/Deleting")
 {
   context< RecoveryMachine >().log_enter(state_name);
   PG *pg = context< RecoveryMachine >().pg;
   pg->deleting = true;
   ObjectStore::Transaction* t = context<RecoveryMachine>().get_cur_transaction();
   pg->on_removal(t);
-  pg->osd->logger->inc(l_osd_pg_removing);
   RecoveryCtx *rctx = context<RecoveryMachine>().get_recovery_ctx();
   Context *fin = new C_DeleteMore(pg, pg->get_osdmap()->get_epoch());
   rctx->on_applied->contexts.push_back(fin);
   rctx->on_safe->contexts.push_back(fin);
 }
 
-boost::statechart::result PG::RecoveryState::Deleting::react(const DeleteSome& evt)
+boost::statechart::result PG::RecoveryState::Deleting::react(
+  const DeleteSome& evt)
 {
   PG *pg = context< RecoveryMachine >().pg;
   pg->_delete_some();
@@ -7993,7 +8066,7 @@ void PG::RecoveryState::Deleting::exit()
   context< RecoveryMachine >().log_exit(state_name, enter_time);
   PG *pg = context< RecoveryMachine >().pg;
   pg->deleting = false;
-  pg->osd->logger->dec(l_osd_pg_removing);
+  pg->osd->local_reserver.cancel_reservation(pg->info.pgid);
 }
 
 /*--------GetInfo---------*/
diff --git a/src/osd/PG.h b/src/osd/PG.h
index 8fe5da84af3..0a708a7a0b0 100644
--- a/src/osd/PG.h
+++ b/src/osd/PG.h
@@ -1231,6 +1231,8 @@ protected:
   unsigned get_recovery_priority();
   /// get backfill reservation priority
   unsigned get_backfill_priority();
+  /// get priority for pg deletion
+  unsigned get_delete_priority();
 
   void mark_clean();  ///< mark an active pg clean
 
@@ -1870,6 +1872,8 @@ protected:
 
   TrivialEvent(DeleteStart)
   TrivialEvent(DeleteSome)
+  TrivialEvent(DeleteReserved)
+  TrivialEvent(DeleteInterrupted)
 
   /* Encapsulates PG recovery process */
   class RecoveryState {
@@ -1981,7 +1985,9 @@ protected:
     //          RepWaitBackfillReserved
     //          RepWaitRecoveryReserved
     //      Stray
-    //      Deleting
+    //      ToDelete
+    //          WaitDeleteReserved
+    //          Deleting
     // Crashed
 
     struct Crashed : boost::statechart::state< Crashed, RecoveryMachine >, NamedState {
@@ -2280,7 +2286,7 @@ protected:
       void exit();
     };
 
-    struct Deleting;
+    struct ToDelete;
     struct RepNotRecovering;
     struct ReplicaActive : boost::statechart::state< ReplicaActive, Started, RepNotRecovering >, NamedState {
       explicit ReplicaActive(my_context ctx);
@@ -2299,7 +2305,7 @@ protected:
         boost::statechart::custom_reaction< UnfoundBackfill >,
         boost::statechart::custom_reaction< RemoteBackfillPreempted >,
         boost::statechart::custom_reaction< RemoteRecoveryPreempted >,
-        boost::statechart::transition<DeleteStart, Deleting>
+        boost::statechart::transition<DeleteStart, ToDelete>
         > reactions;
       boost::statechart::result react(const QueryState& q);
       boost::statechart::result react(const MInfoRec& infoevt);
@@ -2461,7 +2467,7 @@ protected:
         boost::statechart::custom_reaction< MInfoRec >,
         boost::statechart::custom_reaction< ActMap >,
         boost::statechart::custom_reaction< RecoveryDone >,
-        boost::statechart::transition<DeleteStart, Deleting>
+        boost::statechart::transition<DeleteStart, ToDelete>
         > reactions;
       boost::statechart::result react(const MQuery& query);
       boost::statechart::result react(const MLogRec& logevt);
@@ -2472,15 +2478,39 @@ protected:
       }
     };
 
-    struct Deleting : boost::statechart::state<Deleting, Started>, NamedState {
+    struct WaitDeleteReserved;
+    struct ToDelete : boost::statechart::state<ToDelete, Started, WaitDeleteReserved>, NamedState {
+      unsigned priority = 0;
       typedef boost::mpl::list <
         boost::statechart::custom_reaction< ActMap >,
         boost::statechart::custom_reaction< DeleteSome >
         > reactions;
-      explicit Deleting(my_context ctx);
-      boost::statechart::result react(const ActMap &evt) {
+      explicit ToDelete(my_context ctx);
+      boost::statechart::result react(const ActMap &evt);
+      boost::statechart::result react(const DeleteSome &evt) {
+        // happens if we drop out of Deleting due to reprioritization etc.
         return discard_event();
       }
+      void exit();
+    };
+
+    struct Deleting;
+    struct WaitDeleteReserved : boost::statechart::state<WaitDeleteReserved,
+                                                         ToDelete>, NamedState {
+      typedef boost::mpl::list <
+        boost::statechart::transition<DeleteReserved, Deleting>
+        > reactions;
+      explicit WaitDeleteReserved(my_context ctx);
+      void exit();
+    };
+
+    struct Deleting : boost::statechart::state<Deleting,
+                                               ToDelete>, NamedState {
+      typedef boost::mpl::list <
+        boost::statechart::custom_reaction< DeleteSome >,
+        boost::statechart::transition<DeleteInterrupted, WaitDeleteReserved>
+        > reactions;
+      explicit Deleting(my_context ctx);
       boost::statechart::result react(const DeleteSome &evt);
       void exit();
     };
diff --git a/src/osd/osd_types.h b/src/osd/osd_types.h
index 2eb347eabaa..bdab163e902 100644
--- a/src/osd/osd_types.h
+++ b/src/osd/osd_types.h
@@ -87,6 +87,15 @@
 /// max recovery priority for MBackfillReserve, only when forced manually
 #define OSD_RECOVERY_PRIORITY_FORCED 255
 
+/// priority for pg deletion when osd is not fullish
+#define OSD_DELETE_PRIORITY_NORMAL 179
+
+/// priority for pg deletion when osd is approaching full
+#define OSD_DELETE_PRIORITY_FULLISH 219
+
+/// priority when more full
+#define OSD_DELETE_PRIORITY_FULL 255
+
 
 typedef hobject_t collection_list_handle_t;
 
-- 
2.39.5