osd/scrub: handling unexpected scrub requests

author Ronen Friedman <rfriedma@redhat.com>

Thu, 30 Nov 2023 06:35:29 +0000 (00:35 -0600)

committer Ronen Friedman <rfriedma@redhat.com>

Thu, 30 Nov 2023 11:40:27 +0000 (05:40 -0600)
author Ronen Friedman <rfriedma@redhat.com>
Thu, 30 Nov 2023 06:35:29 +0000 (00:35 -0600)
committer Ronen Friedman <rfriedma@redhat.com>
Thu, 30 Nov 2023 11:40:27 +0000 (05:40 -0600)
diff --git a/src/osd/scrubber/pg_scrubber.h b/src/osd/scrubber/pg_scrubber.h

index 832fa17050cc767e095ab67b206f8c264087d8e5..3ac4d6dfdb74df847cf684973ba4268b41a7869e 100644 (file)
--- a/src/osd/scrubber/pg_scrubber.h
+++ b/src/osd/scrubber/pg_scrubber.h
@@ -673,10 +673,18 @@ class PgScrubber : public ScrubPgIF,
    /**
     * (replica) a tag identifying a specific replica operation, i.e. the
     * creation of the replica scrub map for a single chunk.
-   * Incremented immediately before sending a response to the primary,
-   * so that the next request would be identified as such. Also changed
-   * on reservation release.
-   * Used to identify stale scrub-re-sched messages triggered by the backend.
+   *
+   * Background: the backend is asynchronous, and the specific
+   * operations are size-limited. While the scrubber handles a specific
+   * request, it is continuously triggered to poll the backend for the
+   * full results for the chunk handled.
+   * Once the chunk request becomes obsolete, either following an interval
+   * change or if a new request was received, we must not send the stale
+   * data to the primary. The polling of the obsolete chunk request must
+   * stop, and the stale backend response should be discarded.
+   * In other words - the token should be read as saying "the primary has
+   * lost interest in the results of all operations identified by mismatched
+   * token values".
     */
    Scrub::act_token_t m_current_token{1};
  
diff --git a/src/osd/scrubber/scrub_machine.cc b/src/osd/scrubber/scrub_machine.cc

index cb10d87236b8ebf20461bfe76a1b4f3aabcc37bb..2e18ed5cf4e131d7bfbfb83f487a8c41e8b3fc90 100644 (file)
--- a/src/osd/scrubber/scrub_machine.cc
+++ b/src/osd/scrubber/scrub_machine.cc
@@ -797,6 +797,22 @@ ReplicaActiveOp::~ReplicaActiveOp()
    scrbr->replica_handling_done();
  }
  
+sc::result ReplicaActiveOp::react(const StartReplica&)
+{
+  DECLARE_LOCALS;  // 'scrbr' & 'pg_id' aliases
+  dout(10) << "ReplicaActiveOp::react(const StartReplica&)" << dendl;
+
+  const auto msg = fmt::format(
+      "osd.{} pg[{}]: new chunk request while still handling the previous one",
+      scrbr->get_whoami(), scrbr->get_spgid());
+  dout(1) << msg << dendl;
+  scrbr->get_clog()->warn() << msg;
+
+  post_event(ReplicaPushesUpd{});
+
+  // exit & re-enter the state
+  return transit<ReplicaActiveOp>();
+}
  
  // ------------- ReplicaActive/ReplicaWaitUpdates ------------------------
  
diff --git a/src/osd/scrubber/scrub_machine.h b/src/osd/scrubber/scrub_machine.h

index 6dbc8b565e585508211c21338e4408cfd5201cf1..7e9c887145ae09aa26f4316fffcea81431fdaa72 100644 (file)
--- a/src/osd/scrubber/scrub_machine.h
+++ b/src/osd/scrubber/scrub_machine.h
@@ -704,6 +704,20 @@ struct ReplicaActiveOp
        NamedSimply {
    explicit ReplicaActiveOp(my_context ctx);
    ~ReplicaActiveOp();
+
+  using reactions = mpl::list<sc::custom_reaction<StartReplica>>;
+
+  /**
+   * Handling the unexpected (read - caused by a bug) case of receiving a
+   * new chunk request while still handling the previous one.
+   * To note:
+   * - the primary is evidently no longer waiting for the results of the
+   *   previous request. On the other hand
+   * - we must respond to the new request, as the primary would wait for
+   *   it "forever"`,
+   * - and we should log this unexpected scenario clearly in the cluster log.
+   */
+  sc::result react(const StartReplica&);
  };
  
  /*
author	Ronen Friedman <rfriedma@redhat.com>
	Thu, 30 Nov 2023 06:35:29 +0000 (00:35 -0600)
committer	Ronen Friedman <rfriedma@redhat.com>
	Thu, 30 Nov 2023 11:40:27 +0000 (05:40 -0600)
src/osd/scrubber/pg_scrubber.h		patch \| blob \| history
src/osd/scrubber/scrub_machine.cc		patch \| blob \| history
src/osd/scrubber/scrub_machine.h		patch \| blob \| history