]> git-server-git.apps.pok.os.sepia.ceph.com Git - ceph.git/commitdiff
osd/scrub: tag replica scrub messages to identify stale events
authorRonen Friedman <rfriedma@redhat.com>
Thu, 5 Aug 2021 07:53:11 +0000 (07:53 +0000)
committerRonen Friedman <rfriedma@redhat.com>
Fri, 20 Aug 2021 14:33:21 +0000 (14:33 +0000)
A lot can happen while a replica is waiting for the backend
to collect the map data. The Primary might, for example, abort
the scrub and start a new one (following no-scrub commands).

The 'token' introduced here tags 'replica scrub resched'
messages with an index value that is modified on each 'release
scrub resources' request from the Primary.

Fixes: https://tracker.ceph.com/issues/52012
Signed-off-by: Ronen Friedman <rfriedma@redhat.com>
src/osd/OSD.cc
src/osd/OSD.h
src/osd/PG.cc
src/osd/PG.h
src/osd/pg_scrubber.cc
src/osd/pg_scrubber.h
src/osd/scheduler/OpSchedulerItem.cc
src/osd/scheduler/OpSchedulerItem.h
src/osd/scrubber_common.h

index b0518b6b9130ba09f94d5b6adc6d9647b70a3bd3..615dbc0fd1cb884824d0b47dc3bb7ee94047e0c4 100644 (file)
@@ -1747,11 +1747,13 @@ void OSDService::queue_for_snap_trim(PG *pg)
 template <class MSG_TYPE>
 void OSDService::queue_scrub_event_msg(PG* pg,
                                       Scrub::scrub_prio_t with_priority,
-                                      unsigned int qu_priority)
+                                      unsigned int qu_priority,
+                                      Scrub::act_token_t act_token)
 {
   const auto epoch = pg->get_osdmap_epoch();
-  auto msg = new MSG_TYPE(pg->get_pgid(), epoch);
-  dout(15) << "queue a scrub event (" << *msg << ") for " << *pg << ". Epoch: " << epoch << dendl;
+  auto msg = new MSG_TYPE(pg->get_pgid(), epoch, act_token);
+  dout(15) << "queue a scrub event (" << *msg << ") for " << *pg
+           << ". Epoch: " << epoch << " token: " << act_token << dendl;
 
   enqueue_back(OpSchedulerItem(
     unique_ptr<OpSchedulerItem::OpQueueable>(msg), cct->_conf->osd_scrub_cost,
@@ -1759,7 +1761,8 @@ void OSDService::queue_scrub_event_msg(PG* pg,
 }
 
 template <class MSG_TYPE>
-void OSDService::queue_scrub_event_msg(PG* pg, Scrub::scrub_prio_t with_priority)
+void OSDService::queue_scrub_event_msg(PG* pg,
+                                       Scrub::scrub_prio_t with_priority)
 {
   const auto epoch = pg->get_osdmap_epoch();
   auto msg = new MSG_TYPE(pg->get_pgid(), epoch);
@@ -1782,17 +1785,20 @@ void OSDService::queue_scrub_after_repair(PG* pg, Scrub::scrub_prio_t with_prior
 
 void OSDService::queue_for_rep_scrub(PG* pg,
                                     Scrub::scrub_prio_t with_priority,
-                                    unsigned int qu_priority)
+                                    unsigned int qu_priority,
+                                    Scrub::act_token_t act_token)
 {
-  queue_scrub_event_msg<PGRepScrub>(pg, with_priority, qu_priority);
+  queue_scrub_event_msg<PGRepScrub>(pg, with_priority, qu_priority, act_token);
 }
 
 void OSDService::queue_for_rep_scrub_resched(PG* pg,
                                             Scrub::scrub_prio_t with_priority,
-                                            unsigned int qu_priority)
+                                            unsigned int qu_priority,
+                                            Scrub::act_token_t act_token)
 {
   // Resulting scrub event: 'SchedReplica'
-  queue_scrub_event_msg<PGRepScrubResched>(pg, with_priority, qu_priority);
+  queue_scrub_event_msg<PGRepScrubResched>(pg, with_priority, qu_priority,
+                                          act_token);
 }
 
 void OSDService::queue_for_scrub_granted(PG* pg, Scrub::scrub_prio_t with_priority)
index 2a30d7afadb0df53fae5e9baf621de1aa6d0cdde..53e2433a0334b25eccea01bd4d3cc2ec7f1a90b0 100644 (file)
@@ -638,16 +638,19 @@ public:
 
   void queue_for_rep_scrub(PG* pg,
                           Scrub::scrub_prio_t with_high_priority,
-                          unsigned int qu_priority);
+                          unsigned int qu_priority,
+                          Scrub::act_token_t act_token);
 
   /// Signals a change in the number of in-flight recovery writes
   void queue_scrub_replica_pushes(PG *pg, Scrub::scrub_prio_t with_priority);
 
-  /// (not in Crimson) Queue a SchedReplica event to be sent to the replica, to trigger
-  /// a re-check of the availability of the scrub map prepared by the backend.
+  /// (not in Crimson) Queue a SchedReplica event to be sent to the replica, to
+  /// trigger a re-check of the availability of the scrub map prepared by the
+  /// backend.
   void queue_for_rep_scrub_resched(PG* pg,
                                   Scrub::scrub_prio_t with_high_priority,
-                                  unsigned int qu_priority);
+                                  unsigned int qu_priority,
+                                  Scrub::act_token_t act_token);
 
   void queue_for_pg_delete(spg_t pgid, epoch_t e);
   bool try_finish_pg_delete(PG *pg, unsigned old_pg_num);
@@ -661,7 +664,8 @@ private:
   template <class MSG_TYPE>
   void queue_scrub_event_msg(PG* pg,
                             Scrub::scrub_prio_t with_priority,
-                            unsigned int qu_priority);
+                            unsigned int qu_priority,
+                            Scrub::act_token_t act_token);
 
   /// An alternative version of queue_scrub_event_msg(), in which the queuing priority is
   /// provided by the executing scrub (i.e. taken from PgScrubber::m_flags)
index 7991b3d92947687211eaa5026b7e83138226953e..40d58561c35a202a028c06f14fc47404b33ffcf6 100644 (file)
@@ -2079,6 +2079,23 @@ void PG::forward_scrub_event(ScrubAPI fn, epoch_t epoch_queued, std::string_view
   }
 }
 
+void PG::forward_scrub_event(ScrubSafeAPI fn,
+                            epoch_t epoch_queued,
+                            Scrub::act_token_t act_token,
+                            std::string_view desc)
+{
+  dout(20) << __func__ << ": " << desc << " queued: " << epoch_queued
+          << " token: " << act_token << dendl;
+  if (is_active() && m_scrubber) {
+    ((*m_scrubber).*fn)(epoch_queued, act_token);
+  } else {
+    // pg might be in the process of being deleted
+    dout(5) << __func__ << " refusing to forward. "
+           << (is_clean() ? "(clean) " : "(not clean) ")
+           << (is_active() ? "(active) " : "(not active) ") << dendl;
+  }
+}
+
 void PG::replica_scrub(OpRequestRef op, ThreadPool::TPHandle& handle)
 {
   dout(10) << __func__ << " (op)" << dendl;
@@ -2087,12 +2104,14 @@ void PG::replica_scrub(OpRequestRef op, ThreadPool::TPHandle& handle)
 }
 
 void PG::replica_scrub(epoch_t epoch_queued,
+                      Scrub::act_token_t act_token,
                       [[maybe_unused]] ThreadPool::TPHandle& handle)
 {
   dout(10) << __func__ << " queued at: " << epoch_queued
           << (is_primary() ? " (primary)" : " (replica)") << dendl;
   scrub_queued = false;
-  forward_scrub_event(&ScrubPgIF::send_start_replica, epoch_queued, "StartReplica/nw");
+  forward_scrub_event(&ScrubPgIF::send_start_replica, epoch_queued, act_token,
+                     "StartReplica/nw");
 }
 
 bool PG::ops_blocked_by_scrub() const
index c288ff265e76e09ce9cc2e9812d9262aa540d476..4f765d9f2d8a5f2db3634668c909d26f7b974f0e 100644 (file)
@@ -394,12 +394,17 @@ public:
                        "AfterRepairScrub");
   }
 
-  void replica_scrub(epoch_t queued, ThreadPool::TPHandle &handle);
+  void replica_scrub(epoch_t queued,
+                    Scrub::act_token_t act_token,
+                    ThreadPool::TPHandle& handle);
 
-  void replica_scrub_resched(epoch_t queued, ThreadPool::TPHandle& handle)
+  void replica_scrub_resched(epoch_t queued,
+                            Scrub::act_token_t act_token,
+                            ThreadPool::TPHandle& handle)
   {
     scrub_queued = false;
-    forward_scrub_event(&ScrubPgIF::send_sched_replica, queued, "SchedReplica");
+    forward_scrub_event(&ScrubPgIF::send_sched_replica, queued, act_token,
+                       "SchedReplica");
   }
 
   void scrub_send_resources_granted(epoch_t queued, ThreadPool::TPHandle& handle)
@@ -651,8 +656,14 @@ private:
                                  requested_scrub_t& planned) const;
 
   using ScrubAPI = void (ScrubPgIF::*)(epoch_t epoch_queued);
-
   void forward_scrub_event(ScrubAPI fn, epoch_t epoch_queued, std::string_view desc);
+  // and for events that carry a meaningful 'activation token'
+  using ScrubSafeAPI = void (ScrubPgIF::*)(epoch_t epoch_queued,
+                                          Scrub::act_token_t act_token);
+  void forward_scrub_event(ScrubSafeAPI fn,
+                          epoch_t epoch_queued,
+                          Scrub::act_token_t act_token,
+                          std::string_view desc);
 
 public:
   virtual void do_request(
index 1ad3775853979a36a330593a3cb56baa3c4bff67..51fe8d960e622f8f902df10635f757f8ecb13969 100644 (file)
@@ -221,15 +221,17 @@ void PgScrubber::send_scrub_resched(epoch_t epoch_queued)
   dout(10) << "scrubber event --<< " << __func__ << dendl;
 }
 
-void PgScrubber::send_start_replica(epoch_t epoch_queued)
+void PgScrubber::send_start_replica(epoch_t epoch_queued, Scrub::act_token_t token)
 {
-  dout(10) << "scrubber event -->> " << __func__ << " epoch: " << epoch_queued << dendl;
+  dout(10) << "scrubber event -->> " << __func__ << " epoch: " << epoch_queued
+          << " token: " << token << dendl;
   if (is_primary()) {
     // shouldn't happen. Ignore
     dout(1) << "got a replica scrub request while Primary!" << dendl;
     return;
   }
-  if (check_interval(epoch_queued)) {
+
+  if (check_interval(epoch_queued) && is_token_current(token)) {
     m_fsm->my_states();
     // save us some time by not waiting for updates if there are none
     // to wait for. Affects the transition from NotActive into either
@@ -242,10 +244,11 @@ void PgScrubber::send_start_replica(epoch_t epoch_queued)
   dout(10) << "scrubber event --<< " << __func__ << dendl;
 }
 
-void PgScrubber::send_sched_replica(epoch_t epoch_queued)
+void PgScrubber::send_sched_replica(epoch_t epoch_queued, Scrub::act_token_t token)
 {
-  dout(10) << "scrubber event -->> " << __func__ << " epoch: " << epoch_queued << dendl;
-  if (check_interval(epoch_queued)) {
+  dout(10) << "scrubber event -->> " << __func__ << " epoch: " << epoch_queued
+          << " token: " << token << dendl;
+  if (check_interval(epoch_queued) && is_token_current(token)) {
     m_fsm->my_states();
     m_fsm->process_event(SchedReplica{});  // retest for map availability
   }
@@ -491,9 +494,8 @@ void PgScrubber::reg_next_scrub(const requested_scrub_t& request_flags)
 
 void PgScrubber::unreg_next_scrub()
 {
-  dout(10) << __func__ << " existing-" << m_scrub_reg_stamp << ". was registered? "
-          << is_scrub_registered() << dendl;
   if (is_scrub_registered()) {
+    dout(15) << __func__ << " existing-" << m_scrub_reg_stamp << dendl;
     m_osds->unreg_pg_scrub(m_pg->info.pgid, m_scrub_reg_stamp);
     m_scrub_reg_stamp = utime_t{};
   }
@@ -1045,7 +1047,8 @@ int PgScrubber::build_primary_map_chunk()
 int PgScrubber::build_replica_map_chunk()
 {
   dout(10) << __func__ << " interval start: " << m_interval_start
-          << " epoch: " << m_epoch_start << " deep: " << m_is_deep << dendl;
+          << " current token: " << m_current_token << " epoch: " << m_epoch_start
+          << " deep: " << m_is_deep << dendl;
 
   auto ret = build_scrub_map_chunk(replica_scrubmap, replica_scrubmap_pos, m_start, m_end,
                                   m_is_deep);
@@ -1057,7 +1060,7 @@ int PgScrubber::build_replica_map_chunk()
       // (note: previous version used low priority here. Now switched to using the
       // priority of the original message)
       m_osds->queue_for_rep_scrub_resched(m_pg, m_replica_request_priority,
-                                         m_flags.priority);
+                                         m_flags.priority, m_current_token);
       break;
 
     case 0: {
@@ -1250,7 +1253,8 @@ void PgScrubber::replica_scrub_op(OpRequestRef op)
   // make sure the FSM is at NotActive
   m_fsm->assert_not_active();
 
-  m_osds->queue_for_rep_scrub(m_pg, m_replica_request_priority, m_flags.priority);
+  m_osds->queue_for_rep_scrub(m_pg, m_replica_request_priority, m_flags.priority,
+                             m_current_token);
 }
 
 void PgScrubber::set_op_parameters(requested_scrub_t& request)
@@ -1482,7 +1486,9 @@ void PgScrubber::handle_scrub_reserve_request(OpRequestRef op)
    *  otherwise the interval would have changed.
    *  Ostensibly we can discard & redo the reservation. But then we
    *  will be temporarily releasing the OSD resource - and might not be able to grab it
-   *  again. Thus, we simply treat this as a successful new request.
+   *  again. Thus, we simply treat this as a successful new request
+   *  (but mark the fact that if there is a previous request from the primary to
+   *  scrub a specific chunk - that request is now defunct).
    */
 
   if (m_remote_osd_resource.has_value() && m_remote_osd_resource->is_stale()) {
@@ -1500,6 +1506,15 @@ void PgScrubber::handle_scrub_reserve_request(OpRequestRef op)
   if (m_remote_osd_resource.has_value()) {
 
     dout(10) << __func__ << " already reserved." << dendl;
+
+    /*
+     * it might well be that we did not yet finish handling the latest scrub-op from
+     * our primary. This happens, for example, if 'noscrub' was set via a command, then
+     * reset. The primary in this scenario will remain in the same interval, but we do need
+     * to reset our internal state (otherwise - the first renewed 'give me your scrub map'
+     * from the primary will see us in active state, crashing the OSD).
+     */
+    advance_token();
     granted = true;
 
   } else if (m_pg->cct->_conf->osd_scrub_during_recovery ||
@@ -1551,6 +1566,12 @@ void PgScrubber::handle_scrub_reserve_release(OpRequestRef op)
 {
   dout(10) << __func__ << " " << *op->get_req() << dendl;
   op->mark_started();
+
+  /*
+   * this specific scrub session has terminated. All incoming events carrying the old
+   * tag will be discarded.
+   */
+  advance_token();
   m_remote_osd_resource.reset();
 }
 
@@ -2022,6 +2043,30 @@ void PgScrubber::reset_internal_state()
   m_active = false;
 }
 
+// note that only applicable to the Replica:
+void PgScrubber::advance_token()
+{
+  dout(10) << __func__ << " was: " << m_current_token << dendl;
+  m_current_token++;
+
+  // when advance_token() is called, it is assumed that no scrubbing takes place.
+  // We will, though, verify that. And if we are actually still handling a stale request -
+  // both our internal state and the FSM state will be cleared.
+  replica_handling_done();
+  m_fsm->process_event(FullReset{});
+}
+
+bool PgScrubber::is_token_current(Scrub::act_token_t received_token)
+{
+  if (received_token == 0 || received_token == m_current_token) {
+    return true;
+  }
+  dout(5) << __func__ << " obsolete token (" << received_token
+          << " vs current " << m_current_token << dendl;
+
+  return false;
+}
+
 const OSDMapRef& PgScrubber::get_osdmap() const
 {
   return m_pg->get_osdmap();
index aba2450c7b9944f2cdccb51ae8e85f6b8e9b0d85..89d602b7fe2ca2fc2dc5d3467f8647d24cbc55db 100644 (file)
@@ -209,9 +209,9 @@ class PgScrubber : public ScrubPgIF, public ScrubMachineListener {
 
   void send_replica_maps_ready(epoch_t epoch_queued) final;
 
-  void send_start_replica(epoch_t epoch_queued) final;
+  void send_start_replica(epoch_t epoch_queued, Scrub::act_token_t token) final;
 
-  void send_sched_replica(epoch_t epoch_queued) final;
+  void send_sched_replica(epoch_t epoch_queued, Scrub::act_token_t token) final;
 
   void send_replica_pushes_upd(epoch_t epoch_queued) final;
   /**
@@ -438,6 +438,14 @@ class PgScrubber : public ScrubPgIF, public ScrubMachineListener {
  private:
   void reset_internal_state();
 
+  /**
+   *  the current scrubbing operation is done. We should mark that fact, so that
+   *  all events related to the previous operation can be discarded.
+   */
+  void advance_token();
+
+  bool is_token_current(Scrub::act_token_t received_token);
+
   void requeue_waiting() const { m_pg->requeue_ops(m_pg->waiting_for_scrub); }
 
   void _scan_snaps(ScrubMap& smap);
@@ -557,6 +565,16 @@ class PgScrubber : public ScrubPgIF, public ScrubMachineListener {
    *  discarded.
    */
   epoch_t m_epoch_start{0};  ///< the actual epoch when scrubbing started
+
+  /**
+   *  (replica) a tag identifying a specific scrub "session". Incremented whenever the
+   *  Primary releases the replica scrub resources.
+   *  When the scrub session is terminated (even if the interval remains unchanged, as
+   *  might happen following an asok no-scrub command), stale scrub-resched messages
+   *  triggered by the backend will be discarded.
+   */
+  Scrub::act_token_t m_current_token{1};
+
   scrub_flags_t m_flags;
 
   bool m_active{false};
index 264d8bc1edda0ac0a512f31abc85827384de8d37..27db1dfa37be8ba1bdab5891a95dfd0f326f2cb9 100644 (file)
@@ -159,7 +159,7 @@ void PGScrubMapsCompared::run(OSD* osd,
 
 void PGRepScrub::run(OSD* osd, OSDShard* sdata, PGRef& pg, ThreadPool::TPHandle& handle)
 {
-  pg->replica_scrub(epoch_queued, handle);
+  pg->replica_scrub(epoch_queued, activation_index, handle);
   pg->unlock();
 }
 
@@ -168,7 +168,7 @@ void PGRepScrubResched::run(OSD* osd,
                            PGRef& pg,
                            ThreadPool::TPHandle& handle)
 {
-  pg->replica_scrub_resched(epoch_queued, handle);
+  pg->replica_scrub_resched(epoch_queued, activation_index, handle);
   pg->unlock();
 }
 
index 52f48528dfb6df92c0bde5371aa29ecf43951eca..7ba59838e94814b1c97bb3c785615efb59a2a478 100644 (file)
@@ -328,15 +328,29 @@ public:
 class PGScrubItem : public PGOpQueueable {
  protected:
   epoch_t epoch_queued;
+  Scrub::act_token_t activation_index;
   std::string_view message_name;
   PGScrubItem(spg_t pg, epoch_t epoch_queued, std::string_view derivative_name)
-      : PGOpQueueable{pg}, epoch_queued{epoch_queued}, message_name{derivative_name}
+      : PGOpQueueable{pg}
+      , epoch_queued{epoch_queued}
+      , activation_index{0}
+      , message_name{derivative_name}
+  {}
+  PGScrubItem(spg_t pg,
+             epoch_t epoch_queued,
+             Scrub::act_token_t op_index,
+             std::string_view derivative_name)
+      : PGOpQueueable{pg}
+      , epoch_queued{epoch_queued}
+      , activation_index{op_index}
+      , message_name{derivative_name}
   {}
   op_type_t get_op_type() const final { return op_type_t::bg_scrub; }
   std::ostream& print(std::ostream& rhs) const final
   {
     return rhs << message_name << "(pgid=" << get_pgid()
-              << "epoch_queued=" << epoch_queued << ")";
+              << "epoch_queued=" << epoch_queued
+              << " scrub-token=" << activation_index << ")";
   }
   void run(OSD* osd,
           OSDShard* sdata,
@@ -454,15 +468,16 @@ class PGScrubMapsCompared : public PGScrubItem {
 
 class PGRepScrub : public PGScrubItem {
  public:
-  PGRepScrub(spg_t pg, epoch_t epoch_queued) : PGScrubItem{pg, epoch_queued, "PGRepScrub"}
+  PGRepScrub(spg_t pg, epoch_t epoch_queued, Scrub::act_token_t op_token)
+      : PGScrubItem{pg, epoch_queued, op_token, "PGRepScrub"}
   {}
   void run(OSD* osd, OSDShard* sdata, PGRef& pg, ThreadPool::TPHandle& handle) final;
 };
 
 class PGRepScrubResched : public PGScrubItem {
  public:
-  PGRepScrubResched(spg_t pg, epoch_t epoch_queued)
-      : PGScrubItem{pg, epoch_queued, "PGRepScrubResched"}
+  PGRepScrubResched(spg_t pg, epoch_t epoch_queued, Scrub::act_token_t op_token)
+      : PGScrubItem{pg, epoch_queued, op_token, "PGRepScrubResched"}
   {}
   void run(OSD* osd, OSDShard* sdata, PGRef& pg, ThreadPool::TPHandle& handle) final;
 };
index 41a85b6623958b0015766436e455b68937b67bb9..668ae282fd4eabb49029c867ad6115211ffd9083 100644 (file)
@@ -17,6 +17,10 @@ namespace Scrub {
 /// high/low OP priority
 enum class scrub_prio_t : bool { low_priority = false, high_priority = true };
 
+/// Identifies a specific scrub activation within an interval,
+/// see ScrubPGgIF::m_current_token
+using act_token_t = uint32_t;
+
 }  // namespace Scrub
 
 
@@ -134,9 +138,9 @@ struct ScrubPgIF {
 
   virtual void send_replica_pushes_upd(epoch_t epoch_queued) = 0;
 
-  virtual void send_start_replica(epoch_t epoch_queued) = 0;
+  virtual void send_start_replica(epoch_t epoch_queued, Scrub::act_token_t token) = 0;
 
-  virtual void send_sched_replica(epoch_t epoch_queued) = 0;
+  virtual void send_sched_replica(epoch_t epoch_queued, Scrub::act_token_t token) = 0;
 
   virtual void send_full_reset(epoch_t epoch_queued) = 0;