From 491745a1c53e3ff1d8b792946ad556974a232651 Mon Sep 17 00:00:00 2001 From: Ronen Friedman Date: Tue, 28 Nov 2023 09:46:01 -0600 Subject: [PATCH] osd/scrub: unhandled scrub backend errors should cause an abort ... as we do not have any mechanism to handle them. Signed-off-by: Ronen Friedman --- src/osd/scrubber/pg_scrubber.cc | 11 ++++++----- src/osd/scrubber/scrub_machine.cc | 23 +++-------------------- src/osd/scrubber/scrub_machine.h | 13 ++++--------- 3 files changed, 13 insertions(+), 34 deletions(-) diff --git a/src/osd/scrubber/pg_scrubber.cc b/src/osd/scrubber/pg_scrubber.cc index 06dff82b36126..9dd35d61871c0 100644 --- a/src/osd/scrubber/pg_scrubber.cc +++ b/src/osd/scrubber/pg_scrubber.cc @@ -1230,14 +1230,15 @@ int PgScrubber::build_replica_map_chunk() break; default: - // negative retval: build_scrub_map_chunk() signalled an error + // build_scrub_map_chunk() signalled an error // Pre-Pacific code ignored this option, treating it as a success. - // \todo Add an error flag in the returning message. - // \todo: must either abort, send a reply, or return some error message + // Now: "regular" I/O errors were already handled by the backend (by + // setting the error flag in the scrub-map). We are left with the + // "unknown" error case - and we have no mechanism to handle it. + // Thus - we must abort. dout(1) << "Error! Aborting. ActiveReplica::react(SchedReplica) Ret: " << ret << dendl; - // only in debug mode for now: - assert(false && "backend error"); + ceph_abort_msg("backend error"); break; }; diff --git a/src/osd/scrubber/scrub_machine.cc b/src/osd/scrubber/scrub_machine.cc index c68a4a1119c1f..e0da61dde0f85 100644 --- a/src/osd/scrubber/scrub_machine.cc +++ b/src/osd/scrubber/scrub_machine.cc @@ -299,18 +299,6 @@ ActiveScrubbing::~ActiveScrubbing() } } -/* - * The only source of an InternalError event as of now is the BuildMap state, - * when encountering a backend error. - * We kill the scrub and reset the FSM. - */ -sc::result ActiveScrubbing::react(const InternalError&) -{ - DECLARE_LOCALS; // 'scrbr' & 'pg_id' aliases - dout(10) << __func__ << dendl; - return transit(); -} - // ----------------------- RangeBlocked ----------------------------------- /* @@ -535,18 +523,13 @@ BuildMap::BuildMap(my_context ctx) } else { - auto ret = scrbr->build_primary_map_chunk(); - - if (ret == -EINPROGRESS) { + // note that build_primary_map_chunk() may return -EINPROGRESS, but no + // other error value (as those errors would cause it to crash the OSD). + if (scrbr->build_primary_map_chunk() == -EINPROGRESS) { // must wait for the backend to finish. No specific event provided. // build_primary_map_chunk() has already requeued us. dout(20) << "waiting for the backend..." << dendl; - } else if (ret < 0) { - - dout(10) << "BuildMap::BuildMap() Error! Aborting. Ret: " << ret << dendl; - post_event(InternalError{}); - } else { // the local map was created diff --git a/src/osd/scrubber/scrub_machine.h b/src/osd/scrubber/scrub_machine.h index 42cf1b20f7488..df9a4c45e95f8 100644 --- a/src/osd/scrubber/scrub_machine.h +++ b/src/osd/scrubber/scrub_machine.h @@ -156,8 +156,6 @@ MEV(GotReplicas) /// internal - BuildMap preempted. Required, as detected within the ctor MEV(IntBmPreempted) -MEV(InternalError) - MEV(IntLocalMapDone) /// external. called upon success of a MODIFY op. See @@ -481,10 +479,6 @@ struct ActiveScrubbing explicit ActiveScrubbing(my_context ctx); ~ActiveScrubbing(); - - using reactions = mpl::list>; - - sc::result react(const InternalError&); }; struct RangeBlocked : sc::state, NamedSimply { @@ -566,12 +560,13 @@ struct BuildMap : sc::state, NamedSimply { explicit BuildMap(my_context ctx); // possible error scenarios: - // - an error reported by the backend will trigger an 'InternalError' event, - // handled by our parent state; + // - an error reported by the backend will cause the scrubber to + // ceph_abort() the OSD. No need to handle it here. // - if preempted, we switch to DrainReplMaps, where we will wait for all // replicas to send their maps before acknowledging the preemption; // - an interval change will be handled by the relevant 'send-event' - // functions, and will translated into a 'FullReset' event. + // functions, translated into an IntervalChanged event (handled by + // the 'Session' state). using reactions = mpl::list, // looping, waiting for the backend to finish: sc::transition, -- 2.39.5