]> git.apps.os.sepia.ceph.com Git - ceph.git/commitdiff
scrub/osd: add clearer reminders that a scrub is blocked
authorRonen Friedman <rfriedma@redhat.com>
Sat, 11 Jun 2022 07:27:47 +0000 (07:27 +0000)
committerRonen Friedman <rfriedma@redhat.com>
Thu, 16 Jun 2022 12:29:28 +0000 (12:29 +0000)
Whenever a scrub session is waiting for an excessive length
of time for a locked object to be unlocked, the total
number of concurrent scrubs in the system is reduced.

The existing cluster warning issued on such occurrences is
easily overlooked. Here we add a constant reminder each time
the OSD tries to schedule scrubs.

Signed-off-by: Ronen Friedman <rfriedma@redhat.com>
src/osd/OSD.cc
src/osd/osd_types.cc
src/osd/osd_types.h
src/osd/scrubber/osd_scrub_sched.cc
src/osd/scrubber/osd_scrub_sched.h
src/osd/scrubber/pg_scrubber.cc
src/osd/scrubber/pg_scrubber.h
src/osd/scrubber/scrub_machine_lstnr.h

index dc58cb438827df34872174453556b30b3ce9ec14..cc6b761d3da73b8e0ddf42024738796879f0c12e 100644 (file)
@@ -7569,6 +7569,18 @@ void OSD::sched_scrub()
 {
   auto& scrub_scheduler = service.get_scrub_services();
 
+  if (auto blocked_pgs = scrub_scheduler.get_blocked_pgs_count();
+      blocked_pgs > 0) {
+    // some PGs managed by this OSD were blocked by a locked object during
+    // scrub. This means we might not have the resources needed to scrub now.
+    dout(10)
+      << fmt::format(
+          "{}: PGs are blocked while scrubbing due to locked objects ({} PGs)",
+          __func__,
+          blocked_pgs)
+      << dendl;
+  }
+
   // fail fast if no resources are available
   if (!scrub_scheduler.can_inc_scrubs()) {
     dout(20) << __func__ << ": OSD cannot inc scrubs" << dendl;
index 3c30479492691339a8bd5ec484453a0a0008a4c8..6361b80c9474c24e33162cc0891f5380de5c3879 100644 (file)
@@ -2931,10 +2931,17 @@ void pg_stat_t::dump_brief(Formatter *f) const
 std::string pg_stat_t::dump_scrub_schedule() const
 {
   if (scrub_sched_status.m_is_active) {
-    return fmt::format(
-      "{}scrubbing for {}s",
-      ((scrub_sched_status.m_is_deep == scrub_level_t::deep) ? "deep " : ""),
-      scrub_sched_status.m_duration_seconds);
+    // are we blocked (in fact, stuck) on some locked object?
+    if (scrub_sched_status.m_sched_status == pg_scrub_sched_status_t::blocked) {
+      return fmt::format(
+       "Blocked! locked objects (for {}s)",
+       scrub_sched_status.m_duration_seconds);
+    } else {
+      return fmt::format(
+       "{}scrubbing for {}s",
+       ((scrub_sched_status.m_is_deep == scrub_level_t::deep) ? "deep " : ""),
+       scrub_sched_status.m_duration_seconds);
+    }
   }
   switch (scrub_sched_status.m_sched_status) {
     case pg_scrub_sched_status_t::unknown:
index 124bb2b22b84da401b32daef1703197cfb4244fe..bfec916ada494910d65286de2ce6352e39859d11 100644 (file)
@@ -2170,7 +2170,8 @@ enum class pg_scrub_sched_status_t : uint16_t {
   not_queued,     ///< not in the OSD's scrub queue. Probably not active.
   active,          ///< scrubbing
   scheduled,      ///< scheduled for a scrub at an already determined time
-  queued          ///< queued to be scrubbed
+  queued,         ///< queued to be scrubbed
+  blocked         ///< blocked waiting for objects to be unlocked
 };
 
 struct pg_scrubbing_status_t {
index 8e6225eb06cc27391e2c97f8b7988de6da1f24eb..b2bd6e5f0f9bc599c2ae4632fe2fe4369623b357 100644 (file)
@@ -800,3 +800,22 @@ void ScrubQueue::dump_scrub_reservations(ceph::Formatter* f) const
   f->dump_int("scrubs_remote", scrubs_remote);
   f->dump_int("osd_max_scrubs", conf()->osd_max_scrubs);
 }
+
+void ScrubQueue::clear_pg_scrub_blocked(spg_t blocked_pg)
+{
+  dout(5) << fmt::format(": pg {} is unblocked", blocked_pg) << dendl;
+  --blocked_scrubs_cnt;
+  ceph_assert(blocked_scrubs_cnt >= 0);
+}
+
+void ScrubQueue::mark_pg_scrub_blocked(spg_t blocked_pg)
+{
+  dout(5) << fmt::format(": pg {} is blocked on an object", blocked_pg)
+         << dendl;
+  ++blocked_scrubs_cnt;
+}
+
+int ScrubQueue::get_blocked_pgs_count() const
+{
+  return blocked_scrubs_cnt;
+}
index 45b568b12f63b29b80202ec4634ba867d2e27b98..1034d478933877aa59147a95bfdd9b3c03e34ee7 100644 (file)
@@ -235,6 +235,13 @@ class ScrubQueue {
      */
     std::atomic_bool updated{false};
 
+    /**
+     * the scrubber is waiting for locked objects to be unlocked.
+     * Set after a grace period has passed.
+     */
+    bool blocked{false};
+    utime_t blocked_since{};
+
     utime_t penalty_timeout{0, 0};
 
     CephContext* cct;
@@ -376,6 +383,11 @@ class ScrubQueue {
   void dec_scrubs_remote();
   void dump_scrub_reservations(ceph::Formatter* f) const;
 
+  /// counting the number of PGs stuck while scrubbing, waiting for objects
+  void mark_pg_scrub_blocked(spg_t blocked_pg);
+  void clear_pg_scrub_blocked(spg_t blocked_pg);
+  int get_blocked_pgs_count() const;
+
   /**
    * Pacing the scrub operation by inserting delays (mostly between chunks)
    *
@@ -460,6 +472,18 @@ class ScrubQueue {
   int scrubs_local{0};
   int scrubs_remote{0};
 
+  /**
+   * The scrubbing of PGs might be delayed if the scrubbed chunk of objects is
+   * locked by some other operation. A bug might cause this to be an infinite
+   * delay. If that happens, the OSDs "scrub resources" (i.e. the
+   * counters that limit the number of concurrent scrub operations) might
+   * be exhausted.
+   * We do issue a cluster-log warning in such occasions, but that message is
+   * easy to miss. The 'some pg is blocked' global flag is used to note the
+   * existence of such a situation in the scrub-queue log messages.
+   */
+  std::atomic_int_fast16_t blocked_scrubs_cnt{0};
+
   std::atomic_bool a_pg_is_reserving{false};
 
   [[nodiscard]] bool scrub_load_below_threshold() const;
index f424d3e52d60bd848d724a37fb2bd2bb92e2f675..fd40808344892145cd1654a44abee742c92c037f 100644 (file)
@@ -538,6 +538,7 @@ void PgScrubber::update_scrub_job(const requested_scrub_t& request_flags)
       m_pg->info,
       m_pg->get_pgpool().info.opts);
     m_osds->get_scrub_services().update_job(m_scrub_job, suggested);
+    m_pg->publish_stats_to_osd();
   }
 
   dout(15) << __func__ << " done " << registration_state() << dendl;
@@ -566,7 +567,6 @@ void PgScrubber::scrub_requested(scrub_level_t scrub_level,
           << dendl;
 
   update_scrub_job(req_flags);
-  m_pg->publish_stats_to_osd();
 }
 
 
@@ -775,8 +775,10 @@ bool PgScrubber::range_intersects_scrub(const hobject_t& start,
 
 Scrub::BlockedRangeWarning PgScrubber::acquire_blocked_alarm()
 {
+  ceph::timespan grace_period{m_debug_blockrange?4s:120s};
   return std::make_unique<blocked_range_t>(m_osds,
-                                          ceph::timespan{300s},
+                                          grace_period,
+                                          *this,
                                           m_pg_id);
 }
 
@@ -1730,6 +1732,27 @@ bool PgScrubber::is_queued_or_active() const
   return m_queued_or_active;
 }
 
+void PgScrubber::set_scrub_blocked(utime_t since)
+{
+  ceph_assert(!m_scrub_job->blocked);
+  // we are called from a time-triggered lambda,
+  // thus - not under PG-lock
+  PGRef pg = m_osds->osd->lookup_lock_pg(m_pg_id);
+  m_osds->get_scrub_services().mark_pg_scrub_blocked(m_pg_id);
+  m_scrub_job->blocked_since = since;
+  m_scrub_job->blocked = true;
+  m_pg->publish_stats_to_osd();
+  pg->unlock();
+}
+
+void PgScrubber::clear_scrub_blocked()
+{
+  ceph_assert(m_scrub_job->blocked);
+  m_osds->get_scrub_services().clear_pg_scrub_blocked(m_pg_id);
+  m_scrub_job->blocked = false;
+  m_pg->publish_stats_to_osd();
+}
+
 /*
  * note: only called for the Primary.
  */
@@ -2029,30 +2052,52 @@ void PgScrubber::dump_active_scrubber(ceph::Formatter* f, bool is_deep) const
     }
     f->close_section();
   }
-  f->dump_string("schedule", "scrubbing");
+  if (m_scrub_job->blocked) {
+    f->dump_string("schedule", "blocked");
+  } else {
+    f->dump_string("schedule", "scrubbing");
+  }
 }
 
 pg_scrubbing_status_t PgScrubber::get_schedule() const
 {
-  dout(25) << __func__ << dendl;
-
   if (!m_scrub_job) {
     return pg_scrubbing_status_t{};
   }
 
+  dout(25) << fmt::format("{}: active:{} blocked:{}",
+                         __func__,
+                         m_active,
+                         m_scrub_job->blocked)
+          << dendl;
+
   auto now_is = ceph_clock_now();
 
   if (m_active) {
     // report current scrub info, including updated duration
-    int32_t duration = (utime_t{now_is} - scrub_begin_stamp).sec();
 
-    return pg_scrubbing_status_t{
-      utime_t{},
-      duration,
-      pg_scrub_sched_status_t::active,
-      true,  // active
-      (m_is_deep ? scrub_level_t::deep : scrub_level_t::shallow),
-      false /* is periodic? unknown, actually */};
+    if (m_scrub_job->blocked) {
+      // a bug. An object is held locked.
+      int32_t blocked_for =
+       (utime_t{now_is} - m_scrub_job->blocked_since).sec();
+      return pg_scrubbing_status_t{
+       utime_t{},
+       blocked_for,
+       pg_scrub_sched_status_t::blocked,
+       true,  // active
+       (m_is_deep ? scrub_level_t::deep : scrub_level_t::shallow),
+       false};
+
+    } else {
+      int32_t duration = (utime_t{now_is} - scrub_begin_stamp).sec();
+      return pg_scrubbing_status_t{
+       utime_t{},
+       duration,
+       pg_scrub_sched_status_t::active,
+       true,  // active
+       (m_is_deep ? scrub_level_t::deep : scrub_level_t::shallow),
+       false /* is periodic? unknown, actually */};
+    }
   }
   if (m_scrub_job->state != ScrubQueue::qu_state_t::registered) {
     return pg_scrubbing_status_t{utime_t{},
@@ -2665,20 +2710,27 @@ ostream& operator<<(ostream& out, const MapsCollectionStatus& sf)
 
 blocked_range_t::blocked_range_t(OSDService* osds,
                                 ceph::timespan waittime,
+                                ScrubMachineListener& scrubber,
                                 spg_t pg_id)
     : m_osds{osds}
+    , m_scrubber{scrubber}
+    , m_pgid{pg_id}
 {
   auto now_is = std::chrono::system_clock::now();
-  m_callbk = new LambdaContext([now_is, pg_id, osds]([[maybe_unused]] int r) {
+  m_callbk = new LambdaContext([this, now_is]([[maybe_unused]] int r) {
     std::time_t now_c = std::chrono::system_clock::to_time_t(now_is);
     char buf[50];
     strftime(buf, sizeof(buf), "%Y-%m-%dT%H:%M:%S", std::localtime(&now_c));
     lgeneric_subdout(g_ceph_context, osd, 10)
-      << "PgScrubber: " << pg_id << " blocked on an object for too long (since "
-      << buf << ")" << dendl;
-    osds->clog->warn() << "osd." << osds->whoami << " PgScrubber: " << pg_id
-                      << " blocked on an object for too long (since " << buf
-                      << ")";
+      << "PgScrubber: " << m_pgid
+      << " blocked on an object for too long (since " << buf << ")" << dendl;
+    m_osds->clog->warn() << "osd." << m_osds->whoami
+                        << " PgScrubber: " << m_pgid
+                        << " blocked on an object for too long (since " << buf
+                        << ")";
+
+    m_warning_issued = true;
+    m_scrubber.set_scrub_blocked(utime_t{now_c,0});
     return;
   });
 
@@ -2688,6 +2740,9 @@ blocked_range_t::blocked_range_t(OSDService* osds,
 
 blocked_range_t::~blocked_range_t()
 {
+  if (m_warning_issued) {
+    m_scrubber.clear_scrub_blocked();
+  }
   std::lock_guard l(m_osds->sleep_lock);
   m_osds->sleep_timer.cancel_event(m_callbk);
 }
index fc6efa2dd1c55583d48dea389a8c2abd644f3133..06f52e0bc28f6af53a93a22dbf20182928fd6d8b 100644 (file)
@@ -459,6 +459,9 @@ class PgScrubber : public ScrubPgIF,
   void select_range_n_notify() final;
 
   Scrub::BlockedRangeWarning acquire_blocked_alarm() final;
+  void set_scrub_blocked(utime_t since) final;
+  void clear_scrub_blocked() final;
+
 
   /// walk the log to find the latest update that affects our chunk
   eversion_t search_log_for_updates() const final;
index f4fc9e4fbe9bb42d55d559eb0acea657ecabd1a9..9135c3257af414596f074a47d0bce35616978daa 100644 (file)
@@ -9,6 +9,8 @@
 #include "include/Context.h"
 #include "osd/osd_types.h"
 
+struct ScrubMachineListener;
+
 namespace Scrub {
 
 enum class PreemptionNoted { no_preemption, preempted };
@@ -46,11 +48,23 @@ struct preemption_t {
 /// an aux used when blocking on a busy object.
 /// Issues a log warning if still blocked after 'waittime'.
 struct blocked_range_t {
-  blocked_range_t(OSDService* osds, ceph::timespan waittime, spg_t pg_id);
+  blocked_range_t(OSDService* osds,
+                 ceph::timespan waittime,
+                 ScrubMachineListener& scrubber,
+                 spg_t pg_id);
   ~blocked_range_t();
 
   OSDService* m_osds;
+  ScrubMachineListener& m_scrubber;
+
+  /// used to identify ourselves to the PG, when no longer blocked
+  spg_t m_pgid;
   Context* m_callbk;
+
+  // once timed-out, we flag the OSD's scrub-queue as having
+  // a problem. 'm_warning_issued' signals the need to clear
+  // that OSD-wide flag.
+  bool m_warning_issued{false};
 };
 
 using BlockedRangeWarning = std::unique_ptr<blocked_range_t>;
@@ -173,6 +187,16 @@ struct ScrubMachineListener {
   virtual void set_queued_or_active() = 0;
   virtual void clear_queued_or_active() = 0;
 
+  /**
+   * Our scrubbing is blocked, waiting for an excessive length of time for
+   * our target chunk to be unlocked. We will set the corresponding flags,
+   * both in the OSD_wide scrub-queue object, and in our own scrub-job object.
+   * Both flags are used to report the unhealthy state in the log and in
+   * response to scrub-queue queries.
+   */
+  virtual void set_scrub_blocked(utime_t since) = 0;
+  virtual void clear_scrub_blocked() = 0;
+
   /**
    * the FSM interface into the "are we waiting for maps, either our own or from
    * replicas" state.