From 0f0c3936689483b09c350f833087f06f00b19449 Mon Sep 17 00:00:00 2001 From: Ronen Friedman Date: Mon, 11 Mar 2024 12:54:01 -0500 Subject: [PATCH] osd/scrub: handle 'release' events sent during 'scrub abort' Scenario: - the replica is reserved; - the Primary initiates a chunk operation; - the replica is in ReplicaActive/ReplicaActiveOp/ReplicaBuildingMap - 'no-scrub' is set, and the Primary sends a 'release' event to the replica. Desired behavior: - the replica aborts the chunk operation and transitions to ReplicaReserved; - the 'release' event is delivered in the new state. Fixes: https://tracker.ceph.com/issues/64827 Signed-off-by: Ronen Friedman --- src/osd/scrubber/scrub_machine.cc | 10 +++++++++- src/osd/scrubber/scrub_machine.h | 10 ++++++++++ 2 files changed, 19 insertions(+), 1 deletion(-) diff --git a/src/osd/scrubber/scrub_machine.cc b/src/osd/scrubber/scrub_machine.cc index ce4196e1ebbb1..33da35cd8c85b 100644 --- a/src/osd/scrubber/scrub_machine.cc +++ b/src/osd/scrubber/scrub_machine.cc @@ -886,7 +886,7 @@ void ReplicaActive::clear_remote_reservation(bool warn_if_no_reservation) dout(10) << fmt::format( "ReplicaActive::clear_remote_reservation(): " "pending_reservation_nonce {}, reservation_granted {}", - reservation_granted, pending_reservation_nonce) + pending_reservation_nonce, reservation_granted) << dendl; if (reservation_granted || pending_reservation_nonce) { m_osds->get_scrub_reserver().cancel_reservation(pg_id); @@ -1149,6 +1149,14 @@ sc::result ReplicaActiveOp::react(const StartReplica&) return transit(); } +sc::result ReplicaActiveOp::react(const ReplicaRelease& ev) +{ + dout(10) << "ReplicaActiveOp::react(const ReplicaRelease&)" << dendl; + post_event(ev); + return transit>(); +} + + // ------------- ReplicaActive/ReplicaWaitUpdates ------------------------ ReplicaWaitUpdates::ReplicaWaitUpdates(my_context ctx) diff --git a/src/osd/scrubber/scrub_machine.h b/src/osd/scrubber/scrub_machine.h index 254e7861ed956..d56059c6bf87f 100644 --- 
a/src/osd/scrubber/scrub_machine.h +++ b/src/osd/scrubber/scrub_machine.h @@ -1047,6 +1047,7 @@ struct ReplicaActiveOp using reactions = mpl::list< sc::custom_reaction, + sc::custom_reaction, sc::transition>; /** @@ -1060,6 +1061,15 @@ struct ReplicaActiveOp * - and we should log this unexpected scenario clearly in the cluster log. */ sc::result react(const StartReplica&); + + /** + * a 'release' was sent by the primary. Possible scenario: 'no-scrub' + * abort. Our two-step reaction: + * - we exit the 'ActiveOp' state, and + * - we make sure the 'release' is remembered, to be handled by the state + * we would transition into (which should be ReplicaReserved). + */ + sc::result react(const ReplicaRelease&); }; /* -- 2.39.5