]> git-server-git.apps.pok.os.sepia.ceph.com Git - ceph.git/commitdiff
osd/scrub: tag replica scrub messages to identify stale events
authorRonen Friedman <rfriedma@redhat.com>
Thu, 5 Aug 2021 07:53:11 +0000 (07:53 +0000)
committerRonen Friedman <rfriedma@redhat.com>
Wed, 16 Mar 2022 08:45:26 +0000 (08:45 +0000)
A lot can happen while a replica is waiting for the backend
to collect the map data. The Primary might, for example, abort
the scrub and start a new one (following no-scrub commands).

The 'token' introduced here tags 'replica scrub resched'
messages with an index value that is modified on each 'release
scrub resources' request from the Primary.

Fixes: https://tracker.ceph.com/issues/52012
Signed-off-by: Ronen Friedman <rfriedma@redhat.com>
(cherry picked from commit 6ff3a8274a72f4052f343b284cc84d25c4796d0b)

src/osd/OSD.cc
src/osd/OSD.h
src/osd/PG.cc
src/osd/PG.h
src/osd/pg_scrubber.cc
src/osd/pg_scrubber.h
src/osd/scheduler/OpSchedulerItem.cc
src/osd/scheduler/OpSchedulerItem.h
src/osd/scrubber_common.h

index 4093e1d4c98744ac1d92fd2f03d611bb970f9b4f..7aa3d2048e421891f245799c12135409ce12336b 100644 (file)
@@ -1747,11 +1747,13 @@ void OSDService::queue_for_snap_trim(PG *pg)
 template <class MSG_TYPE>
 void OSDService::queue_scrub_event_msg(PG* pg,
                                       Scrub::scrub_prio_t with_priority,
-                                      unsigned int qu_priority)
+                                      unsigned int qu_priority,
+                                      Scrub::act_token_t act_token)
 {
   const auto epoch = pg->get_osdmap_epoch();
-  auto msg = new MSG_TYPE(pg->get_pgid(), epoch);
-  dout(15) << "queue a scrub event (" << *msg << ") for " << *pg << ". Epoch: " << epoch << dendl;
+  auto msg = new MSG_TYPE(pg->get_pgid(), epoch, act_token);
+  dout(15) << "queue a scrub event (" << *msg << ") for " << *pg
+           << ". Epoch: " << epoch << " token: " << act_token << dendl;
 
   enqueue_back(OpSchedulerItem(
     unique_ptr<OpSchedulerItem::OpQueueable>(msg), cct->_conf->osd_scrub_cost,
@@ -1759,7 +1761,8 @@ void OSDService::queue_scrub_event_msg(PG* pg,
 }
 
 template <class MSG_TYPE>
-void OSDService::queue_scrub_event_msg(PG* pg, Scrub::scrub_prio_t with_priority)
+void OSDService::queue_scrub_event_msg(PG* pg,
+                                       Scrub::scrub_prio_t with_priority)
 {
   const auto epoch = pg->get_osdmap_epoch();
   auto msg = new MSG_TYPE(pg->get_pgid(), epoch);
@@ -1782,17 +1785,20 @@ void OSDService::queue_scrub_after_repair(PG* pg, Scrub::scrub_prio_t with_prior
 
 void OSDService::queue_for_rep_scrub(PG* pg,
                                     Scrub::scrub_prio_t with_priority,
-                                    unsigned int qu_priority)
+                                    unsigned int qu_priority,
+                                    Scrub::act_token_t act_token)
 {
-  queue_scrub_event_msg<PGRepScrub>(pg, with_priority, qu_priority);
+  queue_scrub_event_msg<PGRepScrub>(pg, with_priority, qu_priority, act_token);
 }
 
 void OSDService::queue_for_rep_scrub_resched(PG* pg,
                                             Scrub::scrub_prio_t with_priority,
-                                            unsigned int qu_priority)
+                                            unsigned int qu_priority,
+                                            Scrub::act_token_t act_token)
 {
   // Resulting scrub event: 'SchedReplica'
-  queue_scrub_event_msg<PGRepScrubResched>(pg, with_priority, qu_priority);
+  queue_scrub_event_msg<PGRepScrubResched>(pg, with_priority, qu_priority,
+                                          act_token);
 }
 
 void OSDService::queue_for_scrub_granted(PG* pg, Scrub::scrub_prio_t with_priority)
index 15e2bde90119a905cce9dd4b573b9e2df774efed..efbcb40f799e0d5a857e5617a050283b47e2f1b5 100644 (file)
@@ -639,16 +639,19 @@ public:
 
   void queue_for_rep_scrub(PG* pg,
                           Scrub::scrub_prio_t with_high_priority,
-                          unsigned int qu_priority);
+                          unsigned int qu_priority,
+                          Scrub::act_token_t act_token);
 
   /// Signals a change in the number of in-flight recovery writes
   void queue_scrub_replica_pushes(PG *pg, Scrub::scrub_prio_t with_priority);
 
-  /// (not in Crimson) Queue a SchedReplica event to be sent to the replica, to trigger
-  /// a re-check of the availability of the scrub map prepared by the backend.
+  /// (not in Crimson) Queue a SchedReplica event to be sent to the replica, to
+  /// trigger a re-check of the availability of the scrub map prepared by the
+  /// backend.
   void queue_for_rep_scrub_resched(PG* pg,
                                   Scrub::scrub_prio_t with_high_priority,
-                                  unsigned int qu_priority);
+                                  unsigned int qu_priority,
+                                  Scrub::act_token_t act_token);
 
   void queue_for_pg_delete(spg_t pgid, epoch_t e);
   bool try_finish_pg_delete(PG *pg, unsigned old_pg_num);
@@ -662,7 +665,8 @@ private:
   template <class MSG_TYPE>
   void queue_scrub_event_msg(PG* pg,
                             Scrub::scrub_prio_t with_priority,
-                            unsigned int qu_priority);
+                            unsigned int qu_priority,
+                            Scrub::act_token_t act_token);
 
   /// An alternative version of queue_scrub_event_msg(), in which the queuing priority is
   /// provided by the executing scrub (i.e. taken from PgScrubber::m_flags)
index 92adc6aa8e7eebf9cfaa76babd62d8fdf3d65bac..b3fb71c4f6253b181f30c112d61e1e23ad1a227f 100644 (file)
@@ -2085,6 +2085,23 @@ void PG::forward_scrub_event(ScrubAPI fn, epoch_t epoch_queued, std::string_view
   }
 }
 
+void PG::forward_scrub_event(ScrubSafeAPI fn,
+                            epoch_t epoch_queued,
+                            Scrub::act_token_t act_token,
+                            std::string_view desc)
+{
+  dout(20) << __func__ << ": " << desc << " queued: " << epoch_queued
+          << " token: " << act_token << dendl;
+  if (is_active() && m_scrubber) {
+    ((*m_scrubber).*fn)(epoch_queued, act_token);
+  } else {
+    // pg might be in the process of being deleted
+    dout(5) << __func__ << " refusing to forward. "
+           << (is_clean() ? "(clean) " : "(not clean) ")
+           << (is_active() ? "(active) " : "(not active) ") << dendl;
+  }
+}
+
 void PG::replica_scrub(OpRequestRef op, ThreadPool::TPHandle& handle)
 {
   dout(10) << __func__ << " (op)" << dendl;
@@ -2093,12 +2110,14 @@ void PG::replica_scrub(OpRequestRef op, ThreadPool::TPHandle& handle)
 }
 
 void PG::replica_scrub(epoch_t epoch_queued,
+                      Scrub::act_token_t act_token,
                       [[maybe_unused]] ThreadPool::TPHandle& handle)
 {
   dout(10) << __func__ << " queued at: " << epoch_queued
           << (is_primary() ? " (primary)" : " (replica)") << dendl;
   scrub_queued = false;
-  forward_scrub_event(&ScrubPgIF::send_start_replica, epoch_queued, "StartReplica/nw"sv);
+  forward_scrub_event(&ScrubPgIF::send_start_replica, epoch_queued, act_token,
+                     "StartReplica/nw");
 }
 
 bool PG::ops_blocked_by_scrub() const
index 8d2a102d5abc4d3425f7a1e6fced05c84f7d41fd..544bb9481bcb10a7e9ada921871161f41164ed2b 100644 (file)
@@ -400,12 +400,17 @@ public:
                        "AfterRepairScrub"sv);
   }
 
-  void replica_scrub(epoch_t queued, ThreadPool::TPHandle &handle);
+  void replica_scrub(epoch_t queued,
+                    Scrub::act_token_t act_token,
+                    ThreadPool::TPHandle& handle);
 
-  void replica_scrub_resched(epoch_t queued, ThreadPool::TPHandle& handle)
+  void replica_scrub_resched(epoch_t queued,
+                            Scrub::act_token_t act_token,
+                            ThreadPool::TPHandle& handle)
   {
     scrub_queued = false;
-    forward_scrub_event(&ScrubPgIF::send_sched_replica, queued, "SchedReplica"sv);
+    forward_scrub_event(&ScrubPgIF::send_sched_replica, queued, act_token,
+                       "SchedReplica");
   }
 
   void scrub_send_resources_granted(epoch_t queued, ThreadPool::TPHandle& handle)
@@ -657,8 +662,14 @@ private:
                                  requested_scrub_t& planned) const;
 
   using ScrubAPI = void (ScrubPgIF::*)(epoch_t epoch_queued);
-
   void forward_scrub_event(ScrubAPI fn, epoch_t epoch_queued, std::string_view desc);
+  // and for events that carry a meaningful 'activation token'
+  using ScrubSafeAPI = void (ScrubPgIF::*)(epoch_t epoch_queued,
+                                          Scrub::act_token_t act_token);
+  void forward_scrub_event(ScrubSafeAPI fn,
+                          epoch_t epoch_queued,
+                          Scrub::act_token_t act_token,
+                          std::string_view desc);
 
 public:
   virtual void do_request(
index 55e45b512738e07871d385cb211cb3262a9963b8..8fb61d4c7a3bf44b62959912d357b73bd37df7db 100644 (file)
@@ -215,15 +215,17 @@ void PgScrubber::send_scrub_resched(epoch_t epoch_queued)
   dout(10) << "scrubber event --<< " << __func__ << dendl;
 }
 
-void PgScrubber::send_start_replica(epoch_t epoch_queued)
+void PgScrubber::send_start_replica(epoch_t epoch_queued, Scrub::act_token_t token)
 {
-  dout(10) << "scrubber event -->> " << __func__ << " epoch: " << epoch_queued << dendl;
+  dout(10) << "scrubber event -->> " << __func__ << " epoch: " << epoch_queued
+          << " token: " << token << dendl;
   if (is_primary()) {
     // shouldn't happen. Ignore
     dout(1) << "got a replica scrub request while Primary!" << dendl;
     return;
   }
-  if (check_interval(epoch_queued)) {
+
+  if (check_interval(epoch_queued) && is_token_current(token)) {
     m_fsm->my_states();
     // save us some time by not waiting for updates if there are none
     // to wait for. Affects the transition from NotActive into either
@@ -236,10 +238,11 @@ void PgScrubber::send_start_replica(epoch_t epoch_queued)
   dout(10) << "scrubber event --<< " << __func__ << dendl;
 }
 
-void PgScrubber::send_sched_replica(epoch_t epoch_queued)
+void PgScrubber::send_sched_replica(epoch_t epoch_queued, Scrub::act_token_t token)
 {
-  dout(10) << "scrubber event -->> " << __func__ << " epoch: " << epoch_queued << dendl;
-  if (check_interval(epoch_queued)) {
+  dout(10) << "scrubber event -->> " << __func__ << " epoch: " << epoch_queued
+          << " token: " << token << dendl;
+  if (check_interval(epoch_queued) && is_token_current(token)) {
     m_fsm->my_states();
     m_fsm->process_event(SchedReplica{});  // retest for map availability
   }
@@ -485,9 +488,8 @@ void PgScrubber::reg_next_scrub(const requested_scrub_t& request_flags)
 
 void PgScrubber::unreg_next_scrub()
 {
-  dout(10) << __func__ << " existing-" << m_scrub_reg_stamp << ". was registered? "
-          << is_scrub_registered() << dendl;
   if (is_scrub_registered()) {
+    dout(15) << __func__ << " existing-" << m_scrub_reg_stamp << dendl;
     m_osds->unreg_pg_scrub(m_pg->info.pgid, m_scrub_reg_stamp);
     m_scrub_reg_stamp = utime_t{};
   }
@@ -1028,7 +1030,8 @@ int PgScrubber::build_primary_map_chunk()
 int PgScrubber::build_replica_map_chunk()
 {
   dout(10) << __func__ << " interval start: " << m_interval_start
-          << " epoch: " << m_epoch_start << " deep: " << m_is_deep << dendl;
+          << " current token: " << m_current_token << " epoch: " << m_epoch_start
+          << " deep: " << m_is_deep << dendl;
 
   auto ret = build_scrub_map_chunk(replica_scrubmap, replica_scrubmap_pos, m_start, m_end,
                                   m_is_deep);
@@ -1040,7 +1043,7 @@ int PgScrubber::build_replica_map_chunk()
       // (note: previous version used low priority here. Now switched to using the
       // priority of the original message)
       m_osds->queue_for_rep_scrub_resched(m_pg, m_replica_request_priority,
-                                         m_flags.priority);
+                                         m_flags.priority, m_current_token);
       break;
 
     case 0: {
@@ -1232,7 +1235,8 @@ void PgScrubber::replica_scrub_op(OpRequestRef op)
   // make sure the FSM is at NotActive
   m_fsm->assert_not_active();
 
-  m_osds->queue_for_rep_scrub(m_pg, m_replica_request_priority, m_flags.priority);
+  m_osds->queue_for_rep_scrub(m_pg, m_replica_request_priority, m_flags.priority,
+                             m_current_token);
 }
 
 void PgScrubber::set_op_parameters(requested_scrub_t& request)
@@ -1464,7 +1468,9 @@ void PgScrubber::handle_scrub_reserve_request(OpRequestRef op)
    *  otherwise the interval would have changed.
    *  Ostensibly we can discard & redo the reservation. But then we
    *  will be temporarily releasing the OSD resource - and might not be able to grab it
-   *  again. Thus, we simply treat this as a successful new request.
+   *  again. Thus, we simply treat this as a successful new request
+   *  (but mark the fact that if there is a previous request from the primary to
+   *  scrub a specific chunk - that request is now defunct).
    */
 
   if (m_remote_osd_resource.has_value() && m_remote_osd_resource->is_stale()) {
@@ -1482,6 +1488,15 @@ void PgScrubber::handle_scrub_reserve_request(OpRequestRef op)
   if (m_remote_osd_resource.has_value()) {
 
     dout(10) << __func__ << " already reserved." << dendl;
+
+    /*
+     * it might well be that we did not yet finish handling the latest scrub-op from
+     * our primary. This happens, for example, if 'noscrub' was set via a command, then
+     * reset. The primary in this scenario will remain in the same interval, but we do need
+     * to reset our internal state (otherwise - the first renewed 'give me your scrub map'
+     * from the primary will see us in active state, crashing the OSD).
+     */
+    advance_token();
     granted = true;
 
   } else if (m_pg->cct->_conf->osd_scrub_during_recovery ||
@@ -1533,6 +1548,12 @@ void PgScrubber::handle_scrub_reserve_release(OpRequestRef op)
 {
   dout(10) << __func__ << " " << *op->get_req() << dendl;
   op->mark_started();
+
+  /*
+   * this specific scrub session has terminated. All incoming events carrying the old
+   * tag will be discarded.
+   */
+  advance_token();
   m_remote_osd_resource.reset();
 }
 
@@ -2004,6 +2025,30 @@ void PgScrubber::reset_internal_state()
   m_active = false;
 }
 
+// note that only applicable to the Replica:
+void PgScrubber::advance_token()
+{
+  dout(10) << __func__ << " was: " << m_current_token << dendl;
+  m_current_token++;
+
+  // when advance_token() is called, it is assumed that no scrubbing takes place.
+  // We will, though, verify that. And if we are actually still handling a stale request -
+  // both our internal state and the FSM state will be cleared.
+  replica_handling_done();
+  m_fsm->process_event(FullReset{});
+}
+
+bool PgScrubber::is_token_current(Scrub::act_token_t received_token)
+{
+  if (received_token == 0 || received_token == m_current_token) {
+    return true;
+  }
+  dout(5) << __func__ << " obsolete token (" << received_token
+          << " vs current " << m_current_token << dendl;
+
+  return false;
+}
+
 const OSDMapRef& PgScrubber::get_osdmap() const
 {
   return m_pg->get_osdmap();
index 11d08179b53ed52126e3fa3f161823d8db1fa16a..e12f6f0dcc9b50e687fc39b2a46c6744d92c2905 100644 (file)
@@ -208,9 +208,9 @@ class PgScrubber : public ScrubPgIF, public ScrubMachineListener {
 
   void send_replica_maps_ready(epoch_t epoch_queued) final;
 
-  void send_start_replica(epoch_t epoch_queued) final;
+  void send_start_replica(epoch_t epoch_queued, Scrub::act_token_t token) final;
 
-  void send_sched_replica(epoch_t epoch_queued) final;
+  void send_sched_replica(epoch_t epoch_queued, Scrub::act_token_t token) final;
 
   void send_replica_pushes_upd(epoch_t epoch_queued) final;
   /**
@@ -429,6 +429,14 @@ class PgScrubber : public ScrubPgIF, public ScrubMachineListener {
  private:
   void reset_internal_state();
 
+  /**
+   *  the current scrubbing operation is done. We should mark that fact, so that
+   *  all events related to the previous operation can be discarded.
+   */
+  void advance_token();
+
+  bool is_token_current(Scrub::act_token_t received_token);
+
   void requeue_waiting() const { m_pg->requeue_ops(m_pg->waiting_for_scrub); }
 
   void _scan_snaps(ScrubMap& smap);
@@ -548,6 +556,16 @@ class PgScrubber : public ScrubPgIF, public ScrubMachineListener {
    *  discarded.
    */
   epoch_t m_epoch_start{0};  ///< the actual epoch when scrubbing started
+
+  /**
+   *  (replica) a tag identifying a specific scrub "session". Incremented whenever the
+   *  Primary releases the replica scrub resources.
+   *  When the scrub session is terminated (even if the interval remains unchanged, as
+   *  might happen following an asok no-scrub command), stale scrub-resched messages
+   *  triggered by the backend will be discarded.
+   */
+  Scrub::act_token_t m_current_token{1};
+
   scrub_flags_t m_flags;
 
   bool m_active{false};
index 264d8bc1edda0ac0a512f31abc85827384de8d37..27db1dfa37be8ba1bdab5891a95dfd0f326f2cb9 100644 (file)
@@ -159,7 +159,7 @@ void PGScrubMapsCompared::run(OSD* osd,
 
 void PGRepScrub::run(OSD* osd, OSDShard* sdata, PGRef& pg, ThreadPool::TPHandle& handle)
 {
-  pg->replica_scrub(epoch_queued, handle);
+  pg->replica_scrub(epoch_queued, activation_index, handle);
   pg->unlock();
 }
 
@@ -168,7 +168,7 @@ void PGRepScrubResched::run(OSD* osd,
                            PGRef& pg,
                            ThreadPool::TPHandle& handle)
 {
-  pg->replica_scrub_resched(epoch_queued, handle);
+  pg->replica_scrub_resched(epoch_queued, activation_index, handle);
   pg->unlock();
 }
 
index 52f48528dfb6df92c0bde5371aa29ecf43951eca..7ba59838e94814b1c97bb3c785615efb59a2a478 100644 (file)
@@ -328,15 +328,29 @@ public:
 class PGScrubItem : public PGOpQueueable {
  protected:
   epoch_t epoch_queued;
+  Scrub::act_token_t activation_index;
   std::string_view message_name;
   PGScrubItem(spg_t pg, epoch_t epoch_queued, std::string_view derivative_name)
-      : PGOpQueueable{pg}, epoch_queued{epoch_queued}, message_name{derivative_name}
+      : PGOpQueueable{pg}
+      , epoch_queued{epoch_queued}
+      , activation_index{0}
+      , message_name{derivative_name}
+  {}
+  PGScrubItem(spg_t pg,
+             epoch_t epoch_queued,
+             Scrub::act_token_t op_index,
+             std::string_view derivative_name)
+      : PGOpQueueable{pg}
+      , epoch_queued{epoch_queued}
+      , activation_index{op_index}
+      , message_name{derivative_name}
   {}
   op_type_t get_op_type() const final { return op_type_t::bg_scrub; }
   std::ostream& print(std::ostream& rhs) const final
   {
     return rhs << message_name << "(pgid=" << get_pgid()
-              << "epoch_queued=" << epoch_queued << ")";
+              << "epoch_queued=" << epoch_queued
+              << " scrub-token=" << activation_index << ")";
   }
   void run(OSD* osd,
           OSDShard* sdata,
@@ -454,15 +468,16 @@ class PGScrubMapsCompared : public PGScrubItem {
 
 class PGRepScrub : public PGScrubItem {
  public:
-  PGRepScrub(spg_t pg, epoch_t epoch_queued) : PGScrubItem{pg, epoch_queued, "PGRepScrub"}
+  PGRepScrub(spg_t pg, epoch_t epoch_queued, Scrub::act_token_t op_token)
+      : PGScrubItem{pg, epoch_queued, op_token, "PGRepScrub"}
   {}
   void run(OSD* osd, OSDShard* sdata, PGRef& pg, ThreadPool::TPHandle& handle) final;
 };
 
 class PGRepScrubResched : public PGScrubItem {
  public:
-  PGRepScrubResched(spg_t pg, epoch_t epoch_queued)
-      : PGScrubItem{pg, epoch_queued, "PGRepScrubResched"}
+  PGRepScrubResched(spg_t pg, epoch_t epoch_queued, Scrub::act_token_t op_token)
+      : PGScrubItem{pg, epoch_queued, op_token, "PGRepScrubResched"}
   {}
   void run(OSD* osd, OSDShard* sdata, PGRef& pg, ThreadPool::TPHandle& handle) final;
 };
index 73efdea82949c5340a8444f4393e63067d1106eb..34c63f5c5eee02e495e3edb8f2da233a6135a3e8 100644 (file)
@@ -17,6 +17,10 @@ namespace Scrub {
 /// high/low OP priority
 enum class scrub_prio_t : bool { low_priority = false, high_priority = true };
 
+/// Identifies a specific scrub activation within an interval,
+/// see ScrubPGgIF::m_current_token
+using act_token_t = uint32_t;
+
 }  // namespace Scrub
 
 
@@ -134,9 +138,9 @@ struct ScrubPgIF {
 
   virtual void send_replica_pushes_upd(epoch_t epoch_queued) = 0;
 
-  virtual void send_start_replica(epoch_t epoch_queued) = 0;
+  virtual void send_start_replica(epoch_t epoch_queued, Scrub::act_token_t token) = 0;
 
-  virtual void send_sched_replica(epoch_t epoch_queued) = 0;
+  virtual void send_sched_replica(epoch_t epoch_queued, Scrub::act_token_t token) = 0;
 
   virtual void on_applied_when_primary(const eversion_t &applied_version) = 0;