]> git.apps.os.sepia.ceph.com Git - ceph-ci.git/commitdiff
osd/scrubber: use schedule_timer_event_after for reservation timeout
authorSamuel Just <sjust@redhat.com>
Fri, 17 Feb 2023 04:44:58 +0000 (20:44 -0800)
committerRonen Friedman <rfriedma@redhat.com>
Wed, 26 Mar 2025 15:10:30 +0000 (15:10 +0000)
Previously, ReplicaReservations::no_reply_t was responsible for managing
the timer callback and cancellation.  Move the timeout management to the
ReservingReplicas state using schedule_timer_event_after.

Fixes: https://tracker.ceph.com/issues/58461
Signed-off-by: Samuel Just <sjust@redhat.com>
(cherry picked from commit 525478da3698cfe8a154c642dedd0570f1d52b37)

src/osd/scrubber/pg_scrubber.cc
src/osd/scrubber/pg_scrubber.h
src/osd/scrubber/scrub_machine.cc
src/osd/scrubber/scrub_machine.h
src/osd/scrubber/scrub_machine_lstnr.h

index 868003ebdf971122ff25f677c43c8294c846a31b..2cca552107c916d056562e997133d7d8407ce114 100644 (file)
@@ -1836,6 +1836,13 @@ void PgScrubber::unreserve_replicas()
   m_reservations.reset();
 }
 
+void PgScrubber::on_replica_reservation_timeout()
+{
+  if (m_reservations) {
+    m_reservations->handle_no_reply_timeout();
+  }
+}
+
 void PgScrubber::set_reserving_now()
 {
   m_osds->get_scrub_services().set_reserving_now();
@@ -2645,10 +2652,6 @@ ReplicaReservations::ReplicaReservations(
     send_all_done();
 
   } else {
-    // start a timer to handle the case of no replies
-    m_no_reply = make_unique<ReplicaReservations::no_reply_t>(
-      m_osds, m_conf, *this, m_log_msg_prefix);
-
     // send the reservation requests
     for (auto p : m_acting_set) {
       if (p == whoami)
@@ -2666,14 +2669,12 @@ ReplicaReservations::ReplicaReservations(
 void ReplicaReservations::send_all_done()
 {
   // stop any pending timeout timer
-  m_no_reply.reset();
   m_osds->queue_for_scrub_granted(m_pg, scrub_prio_t::low_priority);
 }
 
 void ReplicaReservations::send_reject()
 {
   // stop any pending timeout timer
-  m_no_reply.reset();
   m_scrub_job->resources_failure = true;
   m_osds->queue_for_scrub_denied(m_pg, scrub_prio_t::low_priority);
 }
@@ -2682,7 +2683,6 @@ void ReplicaReservations::discard_all()
 {
   dout(10) << __func__ << ": " << m_reserved_peers << dendl;
 
-  m_no_reply.reset();
   m_had_rejections = true;  // preventing late-coming responses from triggering
                            // events
   m_reserved_peers.clear();
@@ -2712,9 +2712,6 @@ ReplicaReservations::~ReplicaReservations()
   m_had_rejections = true;  // preventing late-coming responses from triggering
                            // events
 
-  // stop any pending timeout timer
-  m_no_reply.reset();
-
   // send un-reserve messages to all reserved replicas. We do not wait for
   // answer (there wouldn't be one). Other incoming messages will be discarded
   // on the way, by our owner.
@@ -2834,6 +2831,7 @@ void ReplicaReservations::handle_no_reply_timeout()
               "{}: timeout! no reply from {}", __func__, m_waited_for_peers)
          << dendl;
 
+  // treat reply timeout as if a REJECT was received
   m_had_rejections = true;  // preventing any additional notifications
   send_reject();
 }
@@ -2843,39 +2841,6 @@ std::ostream& ReplicaReservations::gen_prefix(std::ostream& out) const
   return out << m_log_msg_prefix;
 }
 
-ReplicaReservations::no_reply_t::no_reply_t(
-  OSDService* osds,
-  const ConfigProxy& conf,
-  ReplicaReservations& parent,
-  std::string_view log_prfx)
-    : m_osds{osds}
-    , m_conf{conf}
-    , m_parent{parent}
-    , m_log_prfx{log_prfx}
-{
-  using namespace std::chrono;
-  auto now_is = clock::now();
-  auto timeout =
-    conf.get_val<std::chrono::milliseconds>("osd_scrub_reservation_timeout");
-
-  m_abort_callback = new LambdaContext([this, now_is]([[maybe_unused]] int r) {
-    // behave as if a REJECT was received
-    m_osds->clog->warn() << fmt::format(
-      "{} timeout on replica reservations (since {})", m_log_prfx, now_is);
-    m_parent.handle_no_reply_timeout();
-  });
-
-  std::lock_guard l(m_osds->sleep_lock);
-  m_osds->sleep_timer.add_event_after(timeout, m_abort_callback);
-}
-
-ReplicaReservations::no_reply_t::~no_reply_t()
-{
-  std::lock_guard l(m_osds->sleep_lock);
-  if (m_abort_callback) {
-    m_osds->sleep_timer.cancel_event(m_abort_callback);
-  }
-}
 
 // ///////////////////// LocalReservation //////////////////////////////////
 
index 0818b57e0ceef4def598a8b6c6c329e16f51cf44..ce9ac9c96f064a696ed2fd22989a3188f958e99e 100644 (file)
@@ -123,22 +123,6 @@ class ReplicaReservations {
   using clock = std::chrono::system_clock;
   using tpoint_t = std::chrono::time_point<clock>;
 
-  /// a no-reply timeout handler
-  struct no_reply_t {
-    explicit no_reply_t(
-      OSDService* osds,
-      const ConfigProxy& conf,
-      ReplicaReservations& parent,
-      std::string_view log_prfx);
-
-    ~no_reply_t();
-    OSDService* m_osds;
-    const ConfigProxy& m_conf;
-    ReplicaReservations& m_parent;
-    std::string m_log_prfx;
-    Context* m_abort_callback{nullptr};
-  };
-
   PG* m_pg;
   std::set<pg_shard_t> m_acting_set;
   OSDService* m_osds;
@@ -154,9 +138,6 @@ class ReplicaReservations {
   std::chrono::milliseconds m_timeout;
   std::optional<tpoint_t> m_timeout_point;
 
-  // detecting & handling a "no show" of a replica
-  std::unique_ptr<no_reply_t> m_no_reply;
-
   void release_replica(pg_shard_t peer, epoch_t epoch);
 
   void send_all_done();         ///< all reservations are granted
@@ -398,6 +379,8 @@ class PgScrubber : public ScrubPgIF,
   void discard_replica_reservations() final;
   void clear_scrub_reservations() final;  // PG::clear... fwds to here
   void unreserve_replicas() final;
+  void on_replica_reservation_timeout() final;
+
 
   // managing scrub op registration
 
index fff9b0fca91c2741a31ef51b63e795c0da55713c..56b06077999273084b9b06c16cbfb77f3a60ca8f 100644 (file)
@@ -122,6 +122,15 @@ ReservingReplicas::ReservingReplicas(my_context ctx)
   // replicas resources
   scrbr->set_reserving_now();
   scrbr->reserve_replicas();
+
+  auto timeout = scrbr->get_cct()->_conf.get_val<
+    std::chrono::milliseconds>("osd_scrub_reservation_timeout");
+  if (timeout.count() > 0) {
+    // Start a timer to handle case where the replicas take a long time to
+    // ack the reservation.  See ReservationTimeout handler below.
+    m_timeout_token = machine.schedule_timer_event_after<ReservationTimeout>(
+      timeout);
+  }
 }
 
 ReservingReplicas::~ReservingReplicas()
@@ -130,6 +139,25 @@ ReservingReplicas::~ReservingReplicas()
   scrbr->clear_reserving_now();
 }
 
+sc::result ReservingReplicas::react(const ReservationTimeout&)
+{
+  DECLARE_LOCALS;  // 'scrbr' & 'pg_id' aliases
+  dout(10) << "ReservingReplicas::react(const ReservationTimeout&)" << dendl;
+
+  dout(10)
+    << "PgScrubber: " << scrbr->get_spgid()
+    << " timeout on reserving replicas (since " << entered_at
+    << ")" << dendl;
+  scrbr->get_clog()->warn()
+    << "osd." << scrbr->get_whoami()
+    << " PgScrubber: " << scrbr->get_spgid()
+    << " timeout on reserving replicsa (since " << entered_at
+    << ")";
+
+  scrbr->on_replica_reservation_timeout();
+  return discard_event();
+}
+
 sc::result ReservingReplicas::react(const ReservationFailure&)
 {
   DECLARE_LOCALS;  // 'scrbr' & 'pg_id' aliases
index 01710836a598baa5115944f0e4c2da4cda516513..1d999c2cabec8549289309b435815835c61bd675 100644 (file)
@@ -65,6 +65,9 @@ MEV(RemotesReserved)
 /// a reservation request has failed
 MEV(ReservationFailure)
 
+/// reservations have timed out
+MEV(ReservationTimeout)
+
 /// initiate a new scrubbing session (relevant if we are a Primary)
 MEV(StartScrub)
 
@@ -310,16 +313,22 @@ struct NotActive : sc::state<NotActive, ScrubMachine>, NamedSimply {
 
 struct ReservingReplicas : sc::state<ReservingReplicas, ScrubMachine>,
                           NamedSimply {
-
   explicit ReservingReplicas(my_context ctx);
   ~ReservingReplicas();
   using reactions = mpl::list<sc::custom_reaction<FullReset>,
                              // all replicas granted our resources request
                              sc::transition<RemotesReserved, ActiveScrubbing>,
+                             sc::custom_reaction<ReservationTimeout>,
                              sc::custom_reaction<ReservationFailure>>;
 
+  ceph::coarse_real_clock::time_point entered_at =
+    ceph::coarse_real_clock::now();
+  ScrubMachine::timer_event_token_t m_timeout_token;
+
   sc::result react(const FullReset&);
 
+  sc::result react(const ReservationTimeout&);
+
   /// at least one replica denied us the scrub resources we've requested
   sc::result react(const ReservationFailure&);
 };
index fb31ba2e2f5e214e0b98b19e32a1c09e2a07b678..43154f7b61e28ee457ccc597357091a3dd5cb6d1 100644 (file)
@@ -179,6 +179,8 @@ struct ScrubMachineListener {
 
   virtual void unreserve_replicas() = 0;
 
+  virtual void on_replica_reservation_timeout() = 0;
+
   virtual void set_scrub_begin_time() = 0;
 
   virtual void set_scrub_duration() = 0;