osd/scrub: unhandled scrub backend errors should cause an abort

author Ronen Friedman <rfriedma@redhat.com>

Tue, 28 Nov 2023 15:46:01 +0000 (09:46 -0600)

committer Ronen Friedman <rfriedma@redhat.com>

Sun, 24 Dec 2023 12:54:03 +0000 (14:54 +0200)
author Ronen Friedman <rfriedma@redhat.com>
Tue, 28 Nov 2023 15:46:01 +0000 (09:46 -0600)
committer Ronen Friedman <rfriedma@redhat.com>
Sun, 24 Dec 2023 12:54:03 +0000 (14:54 +0200)
diff --git a/src/osd/scrubber/pg_scrubber.cc b/src/osd/scrubber/pg_scrubber.cc

index 06dff82b36126d5447cdb48d28a1f266d30d8a01..9dd35d61871c0875bb81e503e805aece6cccba80 100644 (file)
--- a/src/osd/scrubber/pg_scrubber.cc
+++ b/src/osd/scrubber/pg_scrubber.cc
@@ -1230,14 +1230,15 @@ int PgScrubber::build_replica_map_chunk()
      break;
  
      default:
-      // negative retval: build_scrub_map_chunk() signalled an error
+      // build_scrub_map_chunk() signalled an error
        // Pre-Pacific code ignored this option, treating it as a success.
-      // \todo Add an error flag in the returning message.
-      // \todo: must either abort, send a reply, or return some error message
+      // Now: "regular" I/O errors were already handled by the backend (by
+      // setting the error flag in the scrub-map). We are left with the
+      // "unknown" error case - and we have no mechanism to handle it.
+      // Thus - we must abort.
        dout(1) << "Error! Aborting. ActiveReplica::react(SchedReplica) Ret: "
               << ret << dendl;
-      // only in debug mode for now:
-      assert(false && "backend error");
+      ceph_abort_msg("backend error");
        break;
    };
  
diff --git a/src/osd/scrubber/scrub_machine.cc b/src/osd/scrubber/scrub_machine.cc

index c68a4a1119c1f586bc61bd9db85dec6e781ecdc0..e0da61dde0f85dd723e6bd97de863c052180fec4 100644 (file)
--- a/src/osd/scrubber/scrub_machine.cc
+++ b/src/osd/scrubber/scrub_machine.cc
@@ -299,18 +299,6 @@ ActiveScrubbing::~ActiveScrubbing()
    }
  }
  
-/*
- * The only source of an InternalError event as of now is the BuildMap state,
- * when encountering a backend error.
- * We kill the scrub and reset the FSM.
- */
-sc::result ActiveScrubbing::react(const InternalError&)
-{
-  DECLARE_LOCALS;  // 'scrbr' & 'pg_id' aliases
-  dout(10) << __func__ << dendl;
-  return transit<NotActive>();
-}
-
  // ----------------------- RangeBlocked -----------------------------------
  
  /*
@@ -535,18 +523,13 @@ BuildMap::BuildMap(my_context ctx)
  
    } else {
  
-    auto ret = scrbr->build_primary_map_chunk();
-
-    if (ret == -EINPROGRESS) {
+    // note that build_primary_map_chunk() may return -EINPROGRESS, but no
+    // other error value (as those errors would cause it to crash the OSD).
+    if (scrbr->build_primary_map_chunk() == -EINPROGRESS) {
        // must wait for the backend to finish. No specific event provided.
        // build_primary_map_chunk() has already requeued us.
        dout(20) << "waiting for the backend..." << dendl;
  
-    } else if (ret < 0) {
-
-      dout(10) << "BuildMap::BuildMap() Error! Aborting. Ret: " << ret << dendl;
-      post_event(InternalError{});
-
      } else {
  
        // the local map was created
diff --git a/src/osd/scrubber/scrub_machine.h b/src/osd/scrubber/scrub_machine.h

index 42cf1b20f7488bdea313ba6f9af648c55dbcb694..df9a4c45e95f8fea8f12036321c8410e06fb28f6 100644 (file)
--- a/src/osd/scrubber/scrub_machine.h
+++ b/src/osd/scrubber/scrub_machine.h
@@ -156,8 +156,6 @@ MEV(GotReplicas)
  /// internal - BuildMap preempted. Required, as detected within the ctor
  MEV(IntBmPreempted)
  
-MEV(InternalError)
-
  MEV(IntLocalMapDone)
  
  /// external. called upon success of a MODIFY op. See
@@ -481,10 +479,6 @@ struct ActiveScrubbing
  
    explicit ActiveScrubbing(my_context ctx);
    ~ActiveScrubbing();
-
-  using reactions = mpl::list<sc::custom_reaction<InternalError>>;
-
-  sc::result react(const InternalError&);
  };
  
  struct RangeBlocked : sc::state<RangeBlocked, ActiveScrubbing>, NamedSimply {
@@ -566,12 +560,13 @@ struct BuildMap : sc::state<BuildMap, ActiveScrubbing>, NamedSimply {
    explicit BuildMap(my_context ctx);
  
    // possible error scenarios:
-  // - an error reported by the backend will trigger an 'InternalError' event,
-  //   handled by our parent state;
+  // - an error reported by the backend will cause the scrubber to
+  //   ceph_abort() the OSD. No need to handle it here.
    // - if preempted, we switch to DrainReplMaps, where we will wait for all
    //   replicas to send their maps before acknowledging the preemption;
    // - an interval change will be handled by the relevant 'send-event'
-  //   functions, and will translated into a 'FullReset' event.
+  //   functions, translated into an IntervalChanged event (handled by
+  //   the 'Session' state).
    using reactions = mpl::list<sc::transition<IntBmPreempted, DrainReplMaps>,
                               // looping, waiting for the backend to finish:
                               sc::transition<InternalSchedScrub, BuildMap>,
author	Ronen Friedman <rfriedma@redhat.com>
	Tue, 28 Nov 2023 15:46:01 +0000 (09:46 -0600)
committer	Ronen Friedman <rfriedma@redhat.com>
	Sun, 24 Dec 2023 12:54:03 +0000 (14:54 +0200)
src/osd/scrubber/pg_scrubber.cc		patch \| blob \| history
src/osd/scrubber/scrub_machine.cc		patch \| blob \| history
src/osd/scrubber/scrub_machine.h		patch \| blob \| history