From c344665ee2dbc275993dd967c16a5c20d8403a40 Mon Sep 17 00:00:00 2001 From: Ronen Friedman Date: Sat, 11 Jun 2022 07:27:47 +0000 Subject: [PATCH] scrub/osd: add clearer reminders that a scrub is blocked Whenever a scrub session is waiting for an excessive length of time for a locked object to be unlocked, the total number of concurrent scrubs in the system is reduced. The existing cluster warning issued on such occurrences is easily overlooked. Here we add a constant reminder each time the OSD tries to schedule scrubs. Signed-off-by: Ronen Friedman --- src/osd/OSD.cc | 12 ++++ src/osd/osd_types.cc | 15 +++-- src/osd/osd_types.h | 3 +- src/osd/scrubber/osd_scrub_sched.cc | 19 ++++++ src/osd/scrubber/osd_scrub_sched.h | 24 +++++++ src/osd/scrubber/pg_scrubber.cc | 93 ++++++++++++++++++++------ src/osd/scrubber/pg_scrubber.h | 3 + src/osd/scrubber/scrub_machine_lstnr.h | 26 ++++++- 8 files changed, 170 insertions(+), 25 deletions(-) diff --git a/src/osd/OSD.cc b/src/osd/OSD.cc index dc58cb438827d..cc6b761d3da73 100644 --- a/src/osd/OSD.cc +++ b/src/osd/OSD.cc @@ -7569,6 +7569,18 @@ void OSD::sched_scrub() { auto& scrub_scheduler = service.get_scrub_services(); + if (auto blocked_pgs = scrub_scheduler.get_blocked_pgs_count(); + blocked_pgs > 0) { + // some PGs managed by this OSD were blocked by a locked object during + // scrub. This means we might not have the resources needed to scrub now. + dout(10) + << fmt::format( + "{}: PGs are blocked while scrubbing due to locked objects ({} PGs)", + __func__, + blocked_pgs) + << dendl; + } + // fail fast if no resources are available if (!scrub_scheduler.can_inc_scrubs()) { dout(20) << __func__ << ": OSD cannot inc scrubs" << dendl; diff --git a/src/osd/osd_types.cc b/src/osd/osd_types.cc index 3c30479492691..6361b80c9474c 100644 --- a/src/osd/osd_types.cc +++ b/src/osd/osd_types.cc @@ -2931,10 +2931,17 @@ void pg_stat_t::dump_brief(Formatter *f) const std::string pg_stat_t::dump_scrub_schedule() const { if (scrub_sched_status.m_is_active) { - return fmt::format( - "{}scrubbing for {}s", - ((scrub_sched_status.m_is_deep == scrub_level_t::deep) ? "deep " : ""), - scrub_sched_status.m_duration_seconds); + // are we blocked (in fact, stuck) on some locked object? + if (scrub_sched_status.m_sched_status == pg_scrub_sched_status_t::blocked) { + return fmt::format( + "Blocked! locked objects (for {}s)", + scrub_sched_status.m_duration_seconds); + } else { + return fmt::format( + "{}scrubbing for {}s", + ((scrub_sched_status.m_is_deep == scrub_level_t::deep) ? "deep " : ""), + scrub_sched_status.m_duration_seconds); + } } switch (scrub_sched_status.m_sched_status) { case pg_scrub_sched_status_t::unknown: diff --git a/src/osd/osd_types.h b/src/osd/osd_types.h index 124bb2b22b84d..bfec916ada494 100644 --- a/src/osd/osd_types.h +++ b/src/osd/osd_types.h @@ -2170,7 +2170,8 @@ enum class pg_scrub_sched_status_t : uint16_t { not_queued, ///< not in the OSD's scrub queue. Probably not active. active, ///< scrubbing scheduled, ///< scheduled for a scrub at an already determined time - queued ///< queued to be scrubbed + queued, ///< queued to be scrubbed + blocked ///< blocked waiting for objects to be unlocked }; struct pg_scrubbing_status_t { diff --git a/src/osd/scrubber/osd_scrub_sched.cc b/src/osd/scrubber/osd_scrub_sched.cc index 8e6225eb06cc2..b2bd6e5f0f9bc 100644 --- a/src/osd/scrubber/osd_scrub_sched.cc +++ b/src/osd/scrubber/osd_scrub_sched.cc @@ -800,3 +800,22 @@ void ScrubQueue::dump_scrub_reservations(ceph::Formatter* f) const f->dump_int("scrubs_remote", scrubs_remote); f->dump_int("osd_max_scrubs", conf()->osd_max_scrubs); } + +void ScrubQueue::clear_pg_scrub_blocked(spg_t blocked_pg) +{ + dout(5) << fmt::format(": pg {} is unblocked", blocked_pg) << dendl; + --blocked_scrubs_cnt; + ceph_assert(blocked_scrubs_cnt >= 0); +} + +void ScrubQueue::mark_pg_scrub_blocked(spg_t blocked_pg) +{ + dout(5) << fmt::format(": pg {} is blocked on an object", blocked_pg) + << dendl; + ++blocked_scrubs_cnt; +} + +int ScrubQueue::get_blocked_pgs_count() const +{ + return blocked_scrubs_cnt; +} diff --git a/src/osd/scrubber/osd_scrub_sched.h b/src/osd/scrubber/osd_scrub_sched.h index 45b568b12f63b..1034d47893387 100644 --- a/src/osd/scrubber/osd_scrub_sched.h +++ b/src/osd/scrubber/osd_scrub_sched.h @@ -235,6 +235,13 @@ class ScrubQueue { */ std::atomic_bool updated{false}; + /** + * the scrubber is waiting for locked objects to be unlocked. + * Set after a grace period has passed. + */ + bool blocked{false}; + utime_t blocked_since{}; + utime_t penalty_timeout{0, 0}; CephContext* cct; @@ -376,6 +383,11 @@ class ScrubQueue { void dec_scrubs_remote(); void dump_scrub_reservations(ceph::Formatter* f) const; + /// counting the number of PGs stuck while scrubbing, waiting for objects + void mark_pg_scrub_blocked(spg_t blocked_pg); + void clear_pg_scrub_blocked(spg_t blocked_pg); + int get_blocked_pgs_count() const; + /** * Pacing the scrub operation by inserting delays (mostly between chunks) * @@ -460,6 +472,18 @@ class ScrubQueue { int scrubs_local{0}; int scrubs_remote{0}; + /** + * The scrubbing of PGs might be delayed if the scrubbed chunk of objects is + * locked by some other operation. A bug might cause this to be an infinite + * delay. If that happens, the OSDs "scrub resources" (i.e. the + * counters that limit the number of concurrent scrub operations) might + * be exhausted. + * We do issue a cluster-log warning in such occasions, but that message is + * easy to miss. The 'some pg is blocked' global flag is used to note the + * existence of such a situation in the scrub-queue log messages. + */ + std::atomic_int_fast16_t blocked_scrubs_cnt{0}; + std::atomic_bool a_pg_is_reserving{false}; [[nodiscard]] bool scrub_load_below_threshold() const; diff --git a/src/osd/scrubber/pg_scrubber.cc b/src/osd/scrubber/pg_scrubber.cc index f424d3e52d60b..fd40808344892 100644 --- a/src/osd/scrubber/pg_scrubber.cc +++ b/src/osd/scrubber/pg_scrubber.cc @@ -538,6 +538,7 @@ void PgScrubber::update_scrub_job(const requested_scrub_t& request_flags) m_pg->info, m_pg->get_pgpool().info.opts); m_osds->get_scrub_services().update_job(m_scrub_job, suggested); + m_pg->publish_stats_to_osd(); } dout(15) << __func__ << " done " << registration_state() << dendl; @@ -566,7 +567,6 @@ void PgScrubber::scrub_requested(scrub_level_t scrub_level, << dendl; update_scrub_job(req_flags); - m_pg->publish_stats_to_osd(); } @@ -775,8 +775,10 @@ bool PgScrubber::range_intersects_scrub(const hobject_t& start, Scrub::BlockedRangeWarning PgScrubber::acquire_blocked_alarm() { + ceph::timespan grace_period{m_debug_blockrange?4s:120s}; return std::make_unique(m_osds, - ceph::timespan{300s}, + grace_period, + *this, m_pg_id); } @@ -1730,6 +1732,27 @@ bool PgScrubber::is_queued_or_active() const return m_queued_or_active; } +void PgScrubber::set_scrub_blocked(utime_t since) +{ + ceph_assert(!m_scrub_job->blocked); + // we are called from a time-triggered lambda, + // thus - not under PG-lock + PGRef pg = m_osds->osd->lookup_lock_pg(m_pg_id); + m_osds->get_scrub_services().mark_pg_scrub_blocked(m_pg_id); + m_scrub_job->blocked_since = since; + m_scrub_job->blocked = true; + m_pg->publish_stats_to_osd(); + pg->unlock(); +} + +void PgScrubber::clear_scrub_blocked() +{ + ceph_assert(m_scrub_job->blocked); + m_osds->get_scrub_services().clear_pg_scrub_blocked(m_pg_id); + m_scrub_job->blocked = false; + m_pg->publish_stats_to_osd(); +} + /* * note: only called for the Primary. */ @@ -2029,30 +2052,52 @@ void PgScrubber::dump_active_scrubber(ceph::Formatter* f, bool is_deep) const } f->close_section(); } - f->dump_string("schedule", "scrubbing"); + if (m_scrub_job->blocked) { + f->dump_string("schedule", "blocked"); + } else { + f->dump_string("schedule", "scrubbing"); + } } pg_scrubbing_status_t PgScrubber::get_schedule() const { - dout(25) << __func__ << dendl; - if (!m_scrub_job) { return pg_scrubbing_status_t{}; } + dout(25) << fmt::format("{}: active:{} blocked:{}", + __func__, + m_active, + m_scrub_job->blocked) + << dendl; + auto now_is = ceph_clock_now(); if (m_active) { // report current scrub info, including updated duration - int32_t duration = (utime_t{now_is} - scrub_begin_stamp).sec(); - return pg_scrubbing_status_t{ - utime_t{}, - duration, - pg_scrub_sched_status_t::active, - true, // active - (m_is_deep ? scrub_level_t::deep : scrub_level_t::shallow), - false /* is periodic? unknown, actually */}; + if (m_scrub_job->blocked) { + // a bug. An object is held locked. + int32_t blocked_for = + (utime_t{now_is} - m_scrub_job->blocked_since).sec(); + return pg_scrubbing_status_t{ + utime_t{}, + blocked_for, + pg_scrub_sched_status_t::blocked, + true, // active + (m_is_deep ? scrub_level_t::deep : scrub_level_t::shallow), + false}; + + } else { + int32_t duration = (utime_t{now_is} - scrub_begin_stamp).sec(); + return pg_scrubbing_status_t{ + utime_t{}, + duration, + pg_scrub_sched_status_t::active, + true, // active + (m_is_deep ? scrub_level_t::deep : scrub_level_t::shallow), + false /* is periodic? unknown, actually */}; + } } if (m_scrub_job->state != ScrubQueue::qu_state_t::registered) { return pg_scrubbing_status_t{utime_t{}, @@ -2665,20 +2710,27 @@ ostream& operator<<(ostream& out, const MapsCollectionStatus& sf) blocked_range_t::blocked_range_t(OSDService* osds, ceph::timespan waittime, + ScrubMachineListener& scrubber, spg_t pg_id) : m_osds{osds} + , m_scrubber{scrubber} + , m_pgid{pg_id} { auto now_is = std::chrono::system_clock::now(); - m_callbk = new LambdaContext([now_is, pg_id, osds]([[maybe_unused]] int r) { + m_callbk = new LambdaContext([this, now_is]([[maybe_unused]] int r) { std::time_t now_c = std::chrono::system_clock::to_time_t(now_is); char buf[50]; strftime(buf, sizeof(buf), "%Y-%m-%dT%H:%M:%S", std::localtime(&now_c)); lgeneric_subdout(g_ceph_context, osd, 10) - << "PgScrubber: " << pg_id << " blocked on an object for too long (since " - << buf << ")" << dendl; - osds->clog->warn() << "osd." << osds->whoami << " PgScrubber: " << pg_id - << " blocked on an object for too long (since " << buf - << ")"; + << "PgScrubber: " << m_pgid + << " blocked on an object for too long (since " << buf << ")" << dendl; + m_osds->clog->warn() << "osd." << m_osds->whoami + << " PgScrubber: " << m_pgid + << " blocked on an object for too long (since " << buf + << ")"; + + m_warning_issued = true; + m_scrubber.set_scrub_blocked(utime_t{now_c,0}); return; }); @@ -2688,6 +2740,9 @@ blocked_range_t::blocked_range_t(OSDService* osds, blocked_range_t::~blocked_range_t() { + if (m_warning_issued) { + m_scrubber.clear_scrub_blocked(); + } std::lock_guard l(m_osds->sleep_lock); m_osds->sleep_timer.cancel_event(m_callbk); } diff --git a/src/osd/scrubber/pg_scrubber.h b/src/osd/scrubber/pg_scrubber.h index fc6efa2dd1c55..06f52e0bc28f6 100644 --- a/src/osd/scrubber/pg_scrubber.h +++ b/src/osd/scrubber/pg_scrubber.h @@ -459,6 +459,9 @@ class PgScrubber : public ScrubPgIF, void select_range_n_notify() final; Scrub::BlockedRangeWarning acquire_blocked_alarm() final; + void set_scrub_blocked(utime_t since) final; + void clear_scrub_blocked() final; + /// walk the log to find the latest update that affects our chunk eversion_t search_log_for_updates() const final; diff --git a/src/osd/scrubber/scrub_machine_lstnr.h b/src/osd/scrubber/scrub_machine_lstnr.h index f4fc9e4fbe9bb..9135c3257af41 100644 --- a/src/osd/scrubber/scrub_machine_lstnr.h +++ b/src/osd/scrubber/scrub_machine_lstnr.h @@ -9,6 +9,8 @@ #include "include/Context.h" #include "osd/osd_types.h" +struct ScrubMachineListener; + namespace Scrub { enum class PreemptionNoted { no_preemption, preempted }; @@ -46,11 +48,23 @@ struct preemption_t { /// an aux used when blocking on a busy object. /// Issues a log warning if still blocked after 'waittime'. struct blocked_range_t { - blocked_range_t(OSDService* osds, ceph::timespan waittime, spg_t pg_id); + blocked_range_t(OSDService* osds, + ceph::timespan waittime, + ScrubMachineListener& scrubber, + spg_t pg_id); ~blocked_range_t(); OSDService* m_osds; + ScrubMachineListener& m_scrubber; + + /// used to identify ourselves to the PG, when no longer blocked + spg_t m_pgid; Context* m_callbk; + + // once timed-out, we flag the OSD's scrub-queue as having + // a problem. 'm_warning_issued' signals the need to clear + // that OSD-wide flag. + bool m_warning_issued{false}; }; using BlockedRangeWarning = std::unique_ptr; @@ -173,6 +187,16 @@ struct ScrubMachineListener { virtual void set_queued_or_active() = 0; virtual void clear_queued_or_active() = 0; + /** + * Our scrubbing is blocked, waiting for an excessive length of time for + * our target chunk to be unlocked. We will set the corresponding flags, + * both in the OSD_wide scrub-queue object, and in our own scrub-job object. + * Both flags are used to report the unhealthy state in the log and in + * response to scrub-queue queries. + */ + virtual void set_scrub_blocked(utime_t since) = 0; + virtual void clear_scrub_blocked() = 0; + /** * the FSM interface into the "are we waiting for maps, either our own or from * replicas" state. -- 2.39.5