From 491745a1c53e3ff1d8b792946ad556974a232651 Mon Sep 17 00:00:00 2001
From: Ronen Friedman <rfriedma@redhat.com>
Date: Tue, 28 Nov 2023 09:46:01 -0600
Subject: [PATCH] osd/scrub: unhandled scrub backend errors should cause an
 abort

... as we do not have any mechanism to handle them.

Signed-off-by: Ronen Friedman <rfriedma@redhat.com>
---
 src/osd/scrubber/pg_scrubber.cc   | 11 ++++++-----
 src/osd/scrubber/scrub_machine.cc | 23 +++--------------------
 src/osd/scrubber/scrub_machine.h  | 13 ++++---------
 3 files changed, 13 insertions(+), 34 deletions(-)
diff --git a/src/osd/scrubber/pg_scrubber.cc b/src/osd/scrubber/pg_scrubber.cc
index 06dff82b36126..9dd35d61871c0 100644
--- a/src/osd/scrubber/pg_scrubber.cc
+++ b/src/osd/scrubber/pg_scrubber.cc
@@ -1230,14 +1230,15 @@ int PgScrubber::build_replica_map_chunk()
     break;
 
     default:
-      // negative retval: build_scrub_map_chunk() signalled an error
+      // build_scrub_map_chunk() signalled an error
       // Pre-Pacific code ignored this option, treating it as a success.
-      // \todo Add an error flag in the returning message.
-      // \todo: must either abort, send a reply, or return some error message
+      // Now: "regular" I/O errors were already handled by the backend (by
+      // setting the error flag in the scrub-map). We are left with the
+      // "unknown" error case - and we have no mechanism to handle it.
+      // Thus - we must abort.
       dout(1) << "Error! Aborting. ActiveReplica::react(SchedReplica) Ret: "
 	      << ret << dendl;
-      // only in debug mode for now:
-      assert(false && "backend error");
+      ceph_abort_msg("backend error");
       break;
   };
 
diff --git a/src/osd/scrubber/scrub_machine.cc b/src/osd/scrubber/scrub_machine.cc
index c68a4a1119c1f..e0da61dde0f85 100644
--- a/src/osd/scrubber/scrub_machine.cc
+++ b/src/osd/scrubber/scrub_machine.cc
@@ -299,18 +299,6 @@ ActiveScrubbing::~ActiveScrubbing()
   }
 }
 
-/*
- * The only source of an InternalError event as of now is the BuildMap state,
- * when encountering a backend error.
- * We kill the scrub and reset the FSM.
- */
-sc::result ActiveScrubbing::react(const InternalError&)
-{
-  DECLARE_LOCALS;  // 'scrbr' & 'pg_id' aliases
-  dout(10) << __func__ << dendl;
-  return transit<NotActive>();
-}
-
 // ----------------------- RangeBlocked -----------------------------------
 
 /*
@@ -535,18 +523,13 @@ BuildMap::BuildMap(my_context ctx)
 
   } else {
 
-    auto ret = scrbr->build_primary_map_chunk();
-
-    if (ret == -EINPROGRESS) {
+    // note that build_primary_map_chunk() may return -EINPROGRESS, but no
+    // other error value (as those errors would cause it to crash the OSD).
+    if (scrbr->build_primary_map_chunk() == -EINPROGRESS) {
       // must wait for the backend to finish. No specific event provided.
       // build_primary_map_chunk() has already requeued us.
       dout(20) << "waiting for the backend..." << dendl;
 
-    } else if (ret < 0) {
-
-      dout(10) << "BuildMap::BuildMap() Error! Aborting. Ret: " << ret << dendl;
-      post_event(InternalError{});
-
     } else {
 
       // the local map was created
diff --git a/src/osd/scrubber/scrub_machine.h b/src/osd/scrubber/scrub_machine.h
index 42cf1b20f7488..df9a4c45e95f8 100644
--- a/src/osd/scrubber/scrub_machine.h
+++ b/src/osd/scrubber/scrub_machine.h
@@ -156,8 +156,6 @@ MEV(GotReplicas)
 /// internal - BuildMap preempted. Required, as detected within the ctor
 MEV(IntBmPreempted)
 
-MEV(InternalError)
-
 MEV(IntLocalMapDone)
 
 /// external. called upon success of a MODIFY op. See
@@ -481,10 +479,6 @@ struct ActiveScrubbing
 
   explicit ActiveScrubbing(my_context ctx);
   ~ActiveScrubbing();
-
-  using reactions = mpl::list<sc::custom_reaction<InternalError>>;
-
-  sc::result react(const InternalError&);
 };
 
 struct RangeBlocked : sc::state<RangeBlocked, ActiveScrubbing>, NamedSimply {
@@ -566,12 +560,13 @@ struct BuildMap : sc::state<BuildMap, ActiveScrubbing>, NamedSimply {
   explicit BuildMap(my_context ctx);
 
   // possible error scenarios:
-  // - an error reported by the backend will trigger an 'InternalError' event,
-  //   handled by our parent state;
+  // - an error reported by the backend will cause the scrubber to
+  //   ceph_abort() the OSD. No need to handle it here.
   // - if preempted, we switch to DrainReplMaps, where we will wait for all
   //   replicas to send their maps before acknowledging the preemption;
   // - an interval change will be handled by the relevant 'send-event'
-  //   functions, and will translated into a 'FullReset' event.
+  //   functions, translated into an IntervalChanged event (handled by
+  //   the 'Session' state).
   using reactions = mpl::list<sc::transition<IntBmPreempted, DrainReplMaps>,
 			      // looping, waiting for the backend to finish:
 			      sc::transition<InternalSchedScrub, BuildMap>,
-- 
2.39.5