From 9802b770f835ea75ea8cae7baaea70d75ce73048 Mon Sep 17 00:00:00 2001 From: Ronen Friedman Date: Thu, 30 Nov 2023 00:35:29 -0600 Subject: [PATCH] osd/scrub: handling unexpected scrub requests if arriving while still handling a previous chunk request, the handling of of the previous chunk request will be aborted. No response is sent. The scrubber resets, then handles the new request. Signed-off-by: Ronen Friedman --- src/osd/scrubber/pg_scrubber.h | 16 ++++++++++++---- src/osd/scrubber/scrub_machine.cc | 16 ++++++++++++++++ src/osd/scrubber/scrub_machine.h | 14 ++++++++++++++ 3 files changed, 42 insertions(+), 4 deletions(-) diff --git a/src/osd/scrubber/pg_scrubber.h b/src/osd/scrubber/pg_scrubber.h index 832fa17050c..3ac4d6dfdb7 100644 --- a/src/osd/scrubber/pg_scrubber.h +++ b/src/osd/scrubber/pg_scrubber.h @@ -673,10 +673,18 @@ class PgScrubber : public ScrubPgIF, /** * (replica) a tag identifying a specific replica operation, i.e. the * creation of the replica scrub map for a single chunk. - * Incremented immediately before sending a response to the primary, - * so that the next request would be identified as such. Also changed - * on reservation release. - * Used to identify stale scrub-re-sched messages triggered by the backend. + * + * Background: the backend is asynchronous, and the specific + * operations are size-limited. While the scrubber handles a specific + * request, it is continuously triggered to poll the backend for the + * full results for the chunk handled. + * Once the chunk request becomes obsolete, either following an interval + * change or if a new request was received, we must not send the stale + * data to the primary. The polling of the obsolete chunk request must + * stop, and the stale backend response should be discarded. + * In other words - the token should be read as saying "the primary has + * lost interest in the results of all operations identified by mismatched + * token values". */ Scrub::act_token_t m_current_token{1}; diff --git a/src/osd/scrubber/scrub_machine.cc b/src/osd/scrubber/scrub_machine.cc index cb10d87236b..2e18ed5cf4e 100644 --- a/src/osd/scrubber/scrub_machine.cc +++ b/src/osd/scrubber/scrub_machine.cc @@ -797,6 +797,22 @@ ReplicaActiveOp::~ReplicaActiveOp() scrbr->replica_handling_done(); } +sc::result ReplicaActiveOp::react(const StartReplica&) +{ + DECLARE_LOCALS; // 'scrbr' & 'pg_id' aliases + dout(10) << "ReplicaActiveOp::react(const StartReplica&)" << dendl; + + const auto msg = fmt::format( + "osd.{} pg[{}]: new chunk request while still handling the previous one", + scrbr->get_whoami(), scrbr->get_spgid()); + dout(1) << msg << dendl; + scrbr->get_clog()->warn() << msg; + + post_event(ReplicaPushesUpd{}); + + // exit & re-enter the state + return transit(); +} // ------------- ReplicaActive/ReplicaWaitUpdates ------------------------ diff --git a/src/osd/scrubber/scrub_machine.h b/src/osd/scrubber/scrub_machine.h index 6dbc8b565e5..7e9c887145a 100644 --- a/src/osd/scrubber/scrub_machine.h +++ b/src/osd/scrubber/scrub_machine.h @@ -704,6 +704,20 @@ struct ReplicaActiveOp NamedSimply { explicit ReplicaActiveOp(my_context ctx); ~ReplicaActiveOp(); + + using reactions = mpl::list>; + + /** + * Handling the unexpected (read - caused by a bug) case of receiving a + * new chunk request while still handling the previous one. + * To note: + * - the primary is evidently no longer waiting for the results of the + * previous request. On the other hand + * - we must respond to the new request, as the primary would wait for + * it "forever"`, + * - and we should log this unexpected scenario clearly in the cluster log. + */ + sc::result react(const StartReplica&); }; /* -- 2.39.5