From efd1a7714767ce80ec0b20059cfb66f8b18be12d Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Tue, 24 Oct 2017 22:32:18 -0500 Subject: [PATCH] osd/PG: make scan recovery op cancellation match up reliably Previously, there was only one time we would end up in this region of code: when the backfill was rejected by the peer. Previously that was apparently reliably when we had an outstanding SCAN request, because we would unconditionally cancle the MAX recovery op and clear waiting_on_backfill. See 624aaf2a4ea9950153a89ff921e2adce683a6f51 for when this code appeared. Now we have several similar paths, and we don't always have an outstanding scan call (I don't think!). Regardless, move most these three cases into a common helper and make the finish_recovery_op completion conditional on whether there is an outstanding SCAN. This fixes a leak of a recovery op when we defer while a scan is outstanding (this bug was recently introduced by e708410542b0a52fbb29e14b76f49c94adbc0a59 and then duplicated by 2463c6463d1ed38a2e15a0960ed1530a47851489). Note that there is still one other time we register MAX ops: when we are finishing backfill. There, we start one per target. But we will always get back our reply and process it in the normal way (that old commit did not change the timing for these). Signed-off-by: Sage Weil --- src/osd/PG.cc | 87 +++++++++++---------------------------------------- src/osd/PG.h | 1 + 2 files changed, 19 insertions(+), 69 deletions(-) diff --git a/src/osd/PG.cc b/src/osd/PG.cc index 06c2767455be4..7a91811b45ee5 100644 --- a/src/osd/PG.cc +++ b/src/osd/PG.cc @@ -6412,16 +6412,11 @@ PG::RecoveryState::Backfilling::Backfilling(my_context ctx) pg->publish_stats_to_osd(); } -boost::statechart::result -PG::RecoveryState::Backfilling::react(const DeferBackfill &c) +void PG::RecoveryState::Backfilling::cancel_backfill() { PG *pg = context< RecoveryMachine >().pg; - ldout(pg->cct, 10) << "defer backfill, retry delay " << c.delay << dendl; pg->osd->local_reserver.cancel_reservation(pg->info.pgid); - pg->state_set(PG_STATE_BACKFILL_WAIT); - pg->state_clear(PG_STATE_BACKFILLING); - for (set::iterator it = pg->backfill_targets.begin(); it != pg->backfill_targets.end(); ++it) { @@ -6438,8 +6433,20 @@ PG::RecoveryState::Backfilling::react(const DeferBackfill &c) } } - pg->waiting_on_backfill.clear(); + if (!pg->waiting_on_backfill.empty()) { + pg->waiting_on_backfill.clear(); + pg->finish_recovery_op(hobject_t::get_max()); + } +} +boost::statechart::result +PG::RecoveryState::Backfilling::react(const DeferBackfill &c) +{ + PG *pg = context< RecoveryMachine >().pg; + ldout(pg->cct, 10) << "defer backfill, retry delay " << c.delay << dendl; + pg->state_set(PG_STATE_BACKFILL_WAIT); + pg->state_clear(PG_STATE_BACKFILLING); + cancel_backfill(); pg->schedule_backfill_retry(c.delay); return transit(); } @@ -6449,29 +6456,9 @@ PG::RecoveryState::Backfilling::react(const UnfoundBackfill &c) { PG *pg = context< RecoveryMachine >().pg; ldout(pg->cct, 10) << "backfill has unfound, can't continue" << dendl; - pg->osd->local_reserver.cancel_reservation(pg->info.pgid); - pg->state_set(PG_STATE_BACKFILL_UNFOUND); pg->state_clear(PG_STATE_BACKFILLING); - - for (set::iterator it = pg->backfill_targets.begin(); - it != pg->backfill_targets.end(); - ++it) { - assert(*it != pg->pg_whoami); - ConnectionRef con = pg->osd->get_con_osd_cluster( - it->osd, pg->get_osdmap()->get_epoch()); - if (con) { - pg->osd->send_message_osd_cluster( - new MBackfillReserve( - MBackfillReserve::RELEASE, - spg_t(pg->info.pgid.pgid, it->shard), - pg->get_osdmap()->get_epoch()), - con.get()); - } - } - - pg->waiting_on_backfill.clear(); - + cancel_backfill(); return transit(); } @@ -6479,28 +6466,9 @@ boost::statechart::result PG::RecoveryState::Backfilling::react(const RemoteReservationRevokedTooFull &) { PG *pg = context< RecoveryMachine >().pg; - pg->osd->local_reserver.cancel_reservation(pg->info.pgid); pg->state_set(PG_STATE_BACKFILL_TOOFULL); - - for (set::iterator it = pg->backfill_targets.begin(); - it != pg->backfill_targets.end(); - ++it) { - assert(*it != pg->pg_whoami); - ConnectionRef con = pg->osd->get_con_osd_cluster( - it->osd, pg->get_osdmap()->get_epoch()); - if (con) { - pg->osd->send_message_osd_cluster( - new MBackfillReserve( - MBackfillReserve::RELEASE, - spg_t(pg->info.pgid.pgid, it->shard), - pg->get_osdmap()->get_epoch()), - con.get()); - } - } - - pg->waiting_on_backfill.clear(); - pg->finish_recovery_op(hobject_t::get_max()); - + pg->state_clear(PG_STATE_BACKFILLING); + cancel_backfill(); pg->schedule_backfill_retry(pg->cct->_conf->osd_recovery_retry_interval); return transit(); } @@ -6509,27 +6477,8 @@ boost::statechart::result PG::RecoveryState::Backfilling::react(const RemoteReservationRevoked &) { PG *pg = context< RecoveryMachine >().pg; - pg->osd->local_reserver.cancel_reservation(pg->info.pgid); pg->state_set(PG_STATE_BACKFILL_WAIT); - - for (set::iterator it = pg->backfill_targets.begin(); - it != pg->backfill_targets.end(); - ++it) { - assert(*it != pg->pg_whoami); - ConnectionRef con = pg->osd->get_con_osd_cluster( - it->osd, pg->get_osdmap()->get_epoch()); - if (con) { - pg->osd->send_message_osd_cluster( - new MBackfillReserve( - MBackfillReserve::RELEASE, - spg_t(pg->info.pgid.pgid, it->shard), - pg->get_osdmap()->get_epoch()), - con.get()); - } - } - - pg->waiting_on_backfill.clear(); - + cancel_backfill(); return transit(); } diff --git a/src/osd/PG.h b/src/osd/PG.h index 2b9571d927600..a03deb2e906b9 100644 --- a/src/osd/PG.h +++ b/src/osd/PG.h @@ -2193,6 +2193,7 @@ protected: boost::statechart::result react(const RemoteReservationRevoked& evt); boost::statechart::result react(const DeferBackfill& evt); boost::statechart::result react(const UnfoundBackfill& evt); + void cancel_backfill(); void exit(); }; -- 2.39.5