From: Ronen Friedman Date: Sun, 15 Nov 2020 16:39:33 +0000 (+0200) Subject: osd: extracting scrubbing functionality from 'PG' X-Git-Tag: v16.1.0~270^2~5 X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=30facb0f2b213f1d8c775dff272e23b31dfa0ee2;p=ceph.git osd: extracting scrubbing functionality from 'PG' into a new PgScrubber object. Note that for PrimaryLogPG, a PG derivative, the change will only be completed in the following commits. Signed-off-by: Ronen Friedman --- diff --git a/src/osd/OSD.cc b/src/osd/OSD.cc index bcc2a9de03b..db0d9eb23f8 100644 --- a/src/osd/OSD.cc +++ b/src/osd/OSD.cc @@ -1767,21 +1767,29 @@ void OSDService::queue_scrub_event_msg(PG* pg, Scrub::scrub_prio_t with_priority pg->scrub_requeue_priority(with_priority), ceph_clock_now(), 0, epoch)); } -void OSDService::queue_for_scrub(PG *pg, bool with_high_priority) +void OSDService::queue_for_scrub(PG* pg, Scrub::scrub_prio_t with_priority) { - unsigned scrub_queue_priority = pg->scrubber.priority; - if (with_high_priority && scrub_queue_priority < cct->_conf->osd_client_op_priority) { - scrub_queue_priority = cct->_conf->osd_client_op_priority; - } - const auto epoch = pg->get_osdmap_epoch(); - enqueue_back( - OpSchedulerItem( - unique_ptr(new PGScrub(pg->get_pgid(), epoch)), - cct->_conf->osd_scrub_cost, - scrub_queue_priority, - ceph_clock_now(), - 0, - epoch)); + queue_scrub_event_msg(pg, with_priority); +} + +void OSDService::queue_scrub_after_repair(PG* pg, Scrub::scrub_prio_t with_priority) +{ + queue_scrub_event_msg(pg, with_priority); +} + +void OSDService::queue_for_rep_scrub(PG* pg, + Scrub::scrub_prio_t with_priority, + unsigned int qu_priority) +{ + queue_scrub_event_msg(pg, with_priority, qu_priority); +} + +void OSDService::queue_for_rep_scrub_resched(PG* pg, + Scrub::scrub_prio_t with_priority, + unsigned int qu_priority) +{ + // Resulting scrub event: 'SchedReplica' + queue_scrub_event_msg(pg, with_priority, qu_priority); } void 
OSDService::queue_for_scrub_granted(PG* pg, Scrub::scrub_prio_t with_priority) @@ -1796,6 +1804,46 @@ void OSDService::queue_for_scrub_denied(PG* pg, Scrub::scrub_prio_t with_priorit queue_scrub_event_msg(pg, with_priority); } +void OSDService::queue_for_scrub_resched(PG* pg, Scrub::scrub_prio_t with_priority) +{ + queue_scrub_event_msg(pg, with_priority); +} + +void OSDService::queue_scrub_pushes_update(PG* pg, Scrub::scrub_prio_t with_priority) +{ + // Resulting scrub event: 'ActivePushesUpd' + queue_scrub_event_msg(pg, with_priority); +} + +void OSDService::queue_scrub_applied_update(PG* pg, Scrub::scrub_prio_t with_priority) +{ + queue_scrub_event_msg(pg, with_priority); +} + +void OSDService::queue_scrub_unblocking(PG* pg, Scrub::scrub_prio_t with_priority) +{ + // Resulting scrub event: 'Unblocked' + queue_scrub_event_msg(pg, with_priority); +} + +void OSDService::queue_scrub_digest_update(PG* pg, Scrub::scrub_prio_t with_priority) +{ + // Resulting scrub event: 'DigestUpdate' + queue_scrub_event_msg(pg, with_priority); +} + +void OSDService::queue_scrub_got_repl_maps(PG* pg, Scrub::scrub_prio_t with_priority) +{ + // Resulting scrub event: 'GotReplicas' + queue_scrub_event_msg(pg, with_priority); +} + +void OSDService::queue_scrub_replica_pushes(PG *pg, Scrub::scrub_prio_t with_priority) +{ + // Resulting scrub event: 'ReplicaPushesUpd' + queue_scrub_event_msg(pg, with_priority); +} + void OSDService::queue_for_pg_delete(spg_t pgid, epoch_t e) { dout(10) << __func__ << " on " << pgid << " e " << e << dendl; @@ -7386,6 +7434,45 @@ bool OSDService::ScrubJob::ScrubJob::operator<(const OSDService::ScrubJob& rhs) return pgid < rhs.pgid; } +// this one is only moved here (from the header) temporarily, for debugging: +void OSDService::unreg_pg_scrub(spg_t pgid, utime_t t) +{ + std::lock_guard l{OSDService::sched_scrub_lock}; + size_t removed = sched_scrub_pg.erase(ScrubJob{cct, pgid, t}); + ceph_assert(removed); + dout(10) << __func__ << " scrub-set removed: " << 
pgid << " T(" << t << ")" << dendl; +} + +// this one is only moved here (from the header) temporarily, for debugging: +utime_t OSDService::reg_pg_scrub(spg_t pgid, utime_t t, double pool_scrub_min_interval, + double pool_scrub_max_interval, bool must) +{ + ScrubJob scrub_job(cct, pgid, t, pool_scrub_min_interval, pool_scrub_max_interval, + must); + std::lock_guard l(OSDService::sched_scrub_lock); + auto [x, inserted] = sched_scrub_pg.insert(scrub_job); + dout(10) << __func__ << " scrub-set inserted: " << pgid << " T(" << t << ")" << " must: " << must << " inserted " + << inserted << dendl; + return scrub_job.sched_time; +} + +void OSDService::dumps_scrub(ceph::Formatter *f) +{ + ceph_assert(f != nullptr); + std::lock_guard l(sched_scrub_lock); + + f->open_array_section("scrubs"); + for (const auto &i: sched_scrub_pg) { + f->open_object_section("scrub"); + f->dump_stream("pgid") << i.pgid; + f->dump_stream("sched_time") << i.sched_time; + f->dump_stream("deadline") << i.deadline; + f->dump_bool("forced", i.sched_time == PgScrubber::scrub_must_stamp()); + f->close_section(); + } + f->close_section(); +} + double OSD::scrub_sleep_time(bool must_scrub) { if (must_scrub) { @@ -7483,14 +7570,17 @@ bool OSD::scrub_load_below_threshold() void OSD::sched_scrub() { + dout(20) << __func__ << " sched_scrub starts" << dendl; + // if not permitted, fail fast if (!service.can_inc_scrubs()) { + dout(20) << __func__ << ": OSD cannot inc scrubs" << dendl; return; } bool allow_requested_repair_only = false; if (service.is_recovery_active() && !cct->_conf->osd_scrub_during_recovery) { if (!cct->_conf->osd_repair_during_recovery) { - dout(20) << __func__ << " not scheduling scrubs due to active recovery" << dendl; + dout(15) << __func__ << ": not scheduling scrubs due to active recovery" << dendl; return; } dout(10) << __func__ @@ -7504,57 +7594,62 @@ void OSD::sched_scrub() bool load_is_low = scrub_load_below_threshold(); dout(20) << "sched_scrub load_is_low=" << (int)load_is_low << 
dendl; - OSDService::ScrubJob scrub; - if (service.first_scrub_stamp(&scrub)) { + OSDService::ScrubJob scrub_job; + if (service.first_scrub_stamp(&scrub_job)) { do { dout(30) << "sched_scrub examine " << scrub.pgid << " at " << scrub.sched_time << dendl; - if (scrub.sched_time > now) { + if (scrub_job.sched_time > now) { // save ourselves some effort - dout(10) << "sched_scrub " << scrub.pgid << " scheduled at " << scrub.sched_time + dout(20) << "sched_scrub " << scrub_job.pgid << " scheduled at " << scrub_job.sched_time << " > " << now << dendl; break; } - if ((scrub.deadline.is_zero() || scrub.deadline >= now) && !(time_permit && load_is_low)) { - dout(10) << __func__ << " not scheduling scrub for " << scrub.pgid << " due to " + if ((scrub_job.deadline.is_zero() || scrub_job.deadline >= now) && !(time_permit && load_is_low)) { + dout(15) << __func__ << " not scheduling scrub for " << scrub_job.pgid << " due to " << (!time_permit ? "time not permit" : "high load") << dendl; continue; } - PGRef pg = _lookup_lock_pg(scrub.pgid); - if (!pg) + PGRef pg = _lookup_lock_pg(scrub_job.pgid); + if (!pg) { + dout(20) << __func__ << " pg " << scrub_job.pgid << " not found" << dendl; continue; + } + // This has already started, so go on to the next scrub job - if (pg->scrubber.active) { + if (pg->is_scrub_active()) { pg->unlock(); - dout(30) << __func__ << ": already in progress pgid " << scrub.pgid << dendl; + dout(20) << __func__ << ": already in progress pgid " << scrub_job.pgid << dendl; continue; } - // Skip other kinds of scrubing if only explicitly requested repairing is allowed - if (allow_requested_repair_only && !pg->scrubber.must_repair) { + // Skip other kinds of scrubbing if only explicitly requested repairing is allowed + if (allow_requested_repair_only && !pg->m_planned_scrub.must_repair) { pg->unlock(); - dout(10) << __func__ << " skip " << scrub.pgid + dout(10) << __func__ << " skip " << scrub_job.pgid << " because repairing is not explicitly requested on it" 
<< dendl; continue; } + // If it is reserving, let it resolve before going to the next scrub job - if (pg->scrubber.local_reserved && !pg->scrubber.active) { + if (pg->m_scrubber->is_reserving()) { pg->unlock(); - dout(30) << __func__ << ": reserve in progress pgid " << scrub.pgid << dendl; + dout(10) << __func__ << ": reserve in progress pgid " << scrub_job.pgid << dendl; break; } - dout(10) << "sched_scrub scrubbing " << scrub.pgid << " at " << scrub.sched_time + dout(15) << "sched_scrub scrubbing " << scrub_job.pgid << " at " << scrub_job.sched_time << (pg->get_must_scrub() ? ", explicitly requested" : (load_is_low ? ", load_is_low" : " deadline < now")) << dendl; if (pg->sched_scrub()) { pg->unlock(); + dout(10) << __func__ << " scheduled a scrub!" << " (~" << scrub_job.pgid << "~)" << dendl; break; } pg->unlock(); - } while (service.next_scrub_stamp(scrub, &scrub)); + } while (service.next_scrub_stamp(scrub_job, &scrub_job)); } dout(20) << "sched_scrub done" << dendl; } @@ -7562,20 +7657,20 @@ void OSD::sched_scrub() void OSD::resched_all_scrubs() { dout(10) << __func__ << ": start" << dendl; - OSDService::ScrubJob scrub; - if (service.first_scrub_stamp(&scrub)) { + OSDService::ScrubJob scrub_job; + if (service.first_scrub_stamp(&scrub_job)) { do { - dout(20) << __func__ << ": examine " << scrub.pgid << dendl; + dout(20) << __func__ << ": examine " << scrub_job.pgid << dendl; - PGRef pg = _lookup_lock_pg(scrub.pgid); + PGRef pg = _lookup_lock_pg(scrub_job.pgid); if (!pg) continue; - if (!pg->scrubber.must_scrub && !pg->scrubber.need_auto) { - dout(20) << __func__ << ": reschedule " << scrub.pgid << dendl; + if (!pg->m_planned_scrub.must_scrub && !pg->m_planned_scrub.need_auto) { + dout(15) << __func__ << ": reschedule " << scrub_job.pgid << dendl; pg->on_info_history_change(); } pg->unlock(); - } while (service.next_scrub_stamp(scrub, &scrub)); + } while (service.next_scrub_stamp(scrub_job, &scrub_job)); } dout(10) << __func__ << ": done" << dendl; } diff 
--git a/src/osd/OSD.h b/src/osd/OSD.h index 92bb331365b..aab81286fdb 100644 --- a/src/osd/OSD.h +++ b/src/osd/OSD.h @@ -291,20 +291,10 @@ public: }; std::set sched_scrub_pg; - /// @returns the scrub_reg_stamp used for unregister the scrub job + /// @returns the scrub_reg_stamp used for unregister'ing the scrub job utime_t reg_pg_scrub(spg_t pgid, utime_t t, double pool_scrub_min_interval, - double pool_scrub_max_interval, bool must) { - ScrubJob scrub(cct, pgid, t, pool_scrub_min_interval, pool_scrub_max_interval, - must); - std::lock_guard l(sched_scrub_lock); - sched_scrub_pg.insert(scrub); - return scrub.sched_time; - } - void unreg_pg_scrub(spg_t pgid, utime_t t) { - std::lock_guard l(sched_scrub_lock); - size_t removed = sched_scrub_pg.erase(ScrubJob(cct, pgid, t)); - ceph_assert(removed); - } + double pool_scrub_max_interval, bool must); + void unreg_pg_scrub(spg_t pgid, utime_t t); bool first_scrub_stamp(ScrubJob *out) { std::lock_guard l(sched_scrub_lock); if (sched_scrub_pg.empty()) @@ -328,21 +318,7 @@ public: return true; } - void dumps_scrub(ceph::Formatter *f) { - ceph_assert(f != nullptr); - std::lock_guard l(sched_scrub_lock); - - f->open_array_section("scrubs"); - for (const auto &i: sched_scrub_pg) { - f->open_object_section("scrub"); - f->dump_stream("pgid") << i.pgid; - f->dump_stream("sched_time") << i.sched_time; - f->dump_stream("deadline") << i.deadline; - f->dump_bool("forced", i.sched_time == PG::Scrubber::scrub_must_stamp()); - f->close_section(); - } - f->close_section(); - } + void dumps_scrub(ceph::Formatter* f); bool can_inc_scrubs(); bool inc_scrubs_local(); @@ -602,7 +578,8 @@ public: AsyncReserver snap_reserver; void queue_recovery_context(PG *pg, GenContext *c); void queue_for_snap_trim(PG *pg); - void queue_for_scrub(PG *pg, bool with_high_priority); + void queue_for_scrub(PG* pg, Scrub::scrub_prio_t with_priority); + void queue_scrub_after_repair(PG* pg, Scrub::scrub_prio_t with_priority); /// queue the message (-> event) that 
all replicas reserved scrub resources for us void queue_for_scrub_granted(PG* pg, Scrub::scrub_prio_t with_priority); @@ -610,6 +587,36 @@ public: /// queue the message (-> event) that some replicas denied our scrub resources request void queue_for_scrub_denied(PG* pg, Scrub::scrub_prio_t with_priority); + /// Signals either (a) the end of a sleep period, or (b) a recheck of the availability + /// of the primary map being created by the backend. + void queue_for_scrub_resched(PG* pg, Scrub::scrub_prio_t with_priority); + + /// Signals a change in the number of in-flight recovery writes + void queue_scrub_pushes_update(PG* pg, Scrub::scrub_prio_t with_priority); + + /// Signals that all pending updates were applied + void queue_scrub_applied_update(PG* pg, Scrub::scrub_prio_t with_priority); + + /// The block-range that was locked and prevented the scrubbing - is freed + void queue_scrub_unblocking(PG* pg, Scrub::scrub_prio_t with_priority); + + /// Signals that all write OPs are done + void queue_scrub_digest_update(PG* pg, Scrub::scrub_prio_t with_priority); + + /// Signals that we (the Primary) got all waited-for scrub-maps from our replicas + void queue_scrub_got_repl_maps(PG* pg, Scrub::scrub_prio_t with_priority); + + void queue_for_rep_scrub(PG* pg, + Scrub::scrub_prio_t with_high_priority, + unsigned int qu_priority); + + /// Signals a change in the number of in-flight recovery writes + void queue_scrub_replica_pushes(PG *pg, Scrub::scrub_prio_t with_priority); + + void queue_for_rep_scrub_resched(PG* pg, + Scrub::scrub_prio_t with_high_priority, + unsigned int qu_priority); + void queue_for_pg_delete(spg_t pgid, epoch_t e); bool try_finish_pg_delete(PG *pg, unsigned old_pg_num); @@ -619,12 +626,14 @@ private: std::list > awaiting_throttle; /// queue a scrub-related message for a PG - template - void queue_scrub_event_msg(PG* pg, Scrub::scrub_prio_t with_priority, unsigned int qu_priority); + template + void queue_scrub_event_msg(PG* pg, + 
Scrub::scrub_prio_t with_priority, + unsigned int qu_priority); /// An alternative version of queue_scrub_event_msg(), in which the queuing priority is /// provided by the executing scrub (i.e. taken from PgScrubber::m_flags) - template + template void queue_scrub_event_msg(PG* pg, Scrub::scrub_prio_t with_priority); utime_t defer_recovery_until; @@ -1682,6 +1691,7 @@ protected: friend class PG; friend struct OSDShard; friend class PrimaryLogPG; + friend class PgScrubber; protected: diff --git a/src/osd/PG.cc b/src/osd/PG.cc index 7090d14bc20..724d896b727 100644 --- a/src/osd/PG.cc +++ b/src/osd/PG.cc @@ -214,7 +214,6 @@ PG::PG(OSDService *o, OSDMapRef curmap, pg_stats_publish_valid(false), finish_sync_event(NULL), scrub_after_recovery(false), - save_req_scrub(false), active_pushes(0), recovery_state( o->cct, @@ -310,7 +309,7 @@ void PG::log_state_exit( osd->pg_recovery_stats.log_exit( state_name, ceph_clock_now() - enter_time, events, event_dur); } - + /********* PG **********/ void PG::remove_snap_mapped_object( @@ -365,29 +364,12 @@ void PG::clear_primary_state() finish_sync_event = 0; // so that _finish_recovery doesn't go off in another thread release_pg_backoffs(); - scrubber.reserved_peers.clear(); + m_scrubber->unreserve_replicas(); scrub_after_recovery = false; - save_req_scrub = false; agent_clear(); } -PG::Scrubber::Scrubber() - : local_reserved(false), remote_reserved(false), reserve_failed(false), - epoch_start(0), - active(false), - shallow_errors(0), deep_errors(0), fixed(0), - must_scrub(false), must_deep_scrub(false), must_repair(false), - need_auto(false), req_scrub(false), time_for_deep(false), - auto_repair(false), - check_repair(false), - deep_scrub_on_error(false), - num_digest_updates_pending(0), - state(INACTIVE), - deep(false) -{} - -PG::Scrubber::~Scrubber() {} bool PG::op_has_sufficient_caps(OpRequestRef& op) { @@ -431,20 +413,6 @@ bool PG::op_has_sufficient_caps(OpRequestRef& op) return cap; } -bool PG::requeue_scrub(bool high_priority) 
-{ - ceph_assert(ceph_mutex_is_locked(_lock)); - if (scrub_queued) { - dout(10) << __func__ << ": already queued" << dendl; - return false; - } else { - dout(10) << __func__ << ": queueing" << dendl; - scrub_queued = true; - osd->queue_for_scrub(this, high_priority); - return true; - } -} - void PG::queue_recovery() { if (!is_primary() || !is_peered()) { @@ -459,55 +427,36 @@ void PG::queue_recovery() } } -bool PG::queue_scrub() +void PG::queue_scrub_after_repair() { + dout(10) << __func__ << dendl; ceph_assert(ceph_mutex_is_locked(_lock)); + + m_planned_scrub.must_deep_scrub = true; + m_planned_scrub.check_repair = true; + m_planned_scrub.must_scrub = true; + if (is_scrubbing()) { - return false; - } - // An interrupted recovery repair could leave this set. - state_clear(PG_STATE_REPAIR); - if (scrubber.need_auto) { - scrubber.must_scrub = true; - scrubber.must_deep_scrub = true; - scrubber.auto_repair = true; - scrubber.need_auto = false; - } - scrubber.priority = scrubber.must_scrub ? 
- cct->_conf->osd_requested_scrub_priority : get_scrub_priority(); - scrubber.must_scrub = false; - state_set(PG_STATE_SCRUBBING); - if (scrubber.must_deep_scrub) { - state_set(PG_STATE_DEEP_SCRUB); - scrubber.must_deep_scrub = false; + dout(10) << __func__ << ": scrubbing already" << dendl; + return; } - if (scrubber.must_repair || scrubber.auto_repair) { - state_set(PG_STATE_REPAIR); - scrubber.must_repair = false; + if (scrub_queued) { + dout(10) << __func__ << ": already queued" << dendl; + return; } - requeue_scrub(); - return true; -} -void PG::scrub_send_resources_granted(epoch_t epoch_queued, - [[maybe_unused]] ThreadPool::TPHandle& handle) -{ - dout(10) << __func__ << " queued at: " << epoch_queued << dendl; - //m_scrubber->send_remotes_reserved(); -} + m_scrubber->set_op_parameters(m_planned_scrub); + dout(15) << __func__ << ": queueing" << dendl; -void PG::scrub_send_resources_denied(epoch_t epoch_queued, - [[maybe_unused]] ThreadPool::TPHandle& handle) -{ - dout(10) << __func__ << " queued at: " << epoch_queued << dendl; - //m_scrubber->send_reservation_failure(); + scrub_queued = true; + osd->queue_scrub_after_repair(this, Scrub::scrub_prio_t::high_priority); } unsigned PG::get_scrub_priority() { // a higher value -> a higher priority - int64_t pool_scrub_priority = 0; - pool.info.opts.get(pool_opts_t::SCRUB_PRIORITY, &pool_scrub_priority); + int64_t pool_scrub_priority = + pool.info.opts.value_or(pool_opts_t::SCRUB_PRIORITY, (int64_t)0); return pool_scrub_priority > 0 ? pool_scrub_priority : cct->_conf->osd_scrub_priority; } @@ -525,8 +474,11 @@ Context *PG::finish_recovery() return finish_sync_event; } -void PG::_finish_recovery(Context *c) +void PG::_finish_recovery(Context* c) { + dout(15) << __func__ << " finish_sync_event? " << finish_sync_event << " clean? 
" + << is_clean() << dendl; + std::scoped_lock locker{*this}; if (recovery_state.is_deleting() || !is_clean()) { dout(10) << __func__ << " raced with delete or repair" << dendl; @@ -535,7 +487,7 @@ void PG::_finish_recovery(Context *c) // When recovery is initiated by a repair, that flag is left on state_clear(PG_STATE_REPAIR); if (c == finish_sync_event) { - dout(10) << "_finish_recovery" << dendl; + dout(15) << __func__ << " scrub_after_recovery? " << scrub_after_recovery << dendl; finish_sync_event = 0; recovery_state.purge_strays(); @@ -544,11 +496,7 @@ void PG::_finish_recovery(Context *c) if (scrub_after_recovery) { dout(10) << "_finish_recovery requeueing for scrub" << dendl; scrub_after_recovery = false; - scrubber.must_deep_scrub = true; - scrubber.check_repair = true; - // We remember whether req_scrub was set when scrub_after_recovery set to true - scrubber.req_scrub = save_req_scrub; - queue_scrub(); + queue_scrub_after_repair(); } } else { dout(10) << "_finish_recovery -- stale" << dendl; @@ -1359,243 +1307,247 @@ void PG::requeue_map_waiters() } } +bool PG::get_must_scrub() const +{ + dout(20) << __func__ << " must_scrub? " << (m_planned_scrub.must_scrub ? 
"true" : "false") << dendl; + return m_planned_scrub.must_scrub; +} unsigned int PG::scrub_requeue_priority(Scrub::scrub_prio_t with_priority) const { - return 0; // next commit: m_scrubber->scrub_requeue_priority(with_priority); + return m_scrubber->scrub_requeue_priority(with_priority); } unsigned int PG::scrub_requeue_priority(Scrub::scrub_prio_t with_priority, unsigned int suggested_priority) const { - return 0; // next commit: m_scrubber->scrub_requeue_priority(with_priority, suggested_priority); + return m_scrubber->scrub_requeue_priority(with_priority, suggested_priority); } // ========================================================================================== // SCRUB /* - * when holding pg and sched_scrub_lock, then the states are: - * scheduling: - * scrubber.local_reserved = true - * scrubber.active = false - * scrubber.reserved_peers includes whoami - * osd->scrubs_local++ - * scheduling, replica declined: - * scrubber.local_reserved = true - * scrubber.reserved_peers includes -1 - * osd->scrub_local++ - * pending: - * scrubber.local_reserved = true - * scrubber.active = false - * scrubber.reserved_peers.size() == acting.size(); - * pg on scrub_wq - * osd->scrub_local++ - * scrubbing: - * scrubber.local_reserved = true; - * scrubber.active = true - * scrubber.reserved_peers empty + * implementation note: + * PG::sched_scrub() is called only once per a specific scrub session. + * That call commits us to the whatever choices are made (deep/shallow, etc'). + * Unless failing to start scrubbing, the 'planned scrub' flag-set is 'frozen' into + * PgScrubber's m_flags, then cleared. */ - -// returns true if a scrub has been newly kicked off bool PG::sched_scrub() { + dout(15) << __func__ << " pg(" << info.pgid + << (is_active() ? ") " : ") ") + << (is_clean() ? 
" " : " ") << dendl; ceph_assert(ceph_mutex_is_locked(_lock)); ceph_assert(!is_scrubbing()); - if (!(is_primary() && is_active() && is_clean())) { + + if (!is_primary() || !is_active() || !is_clean()) { return false; } - // All processing the first time through commits us to whatever - // choices are made. - if (!scrubber.local_reserved) { - dout(20) << __func__ << ": Start processing pg " << info.pgid << dendl; - - bool allow_deep_scrub = !(get_osdmap()->test_flag(CEPH_OSDMAP_NODEEP_SCRUB) || - pool.info.has_flag(pg_pool_t::FLAG_NODEEP_SCRUB)); - bool allow_scrub = !(get_osdmap()->test_flag(CEPH_OSDMAP_NOSCRUB) || - pool.info.has_flag(pg_pool_t::FLAG_NOSCRUB)); - bool has_deep_errors = (info.stats.stats.sum.num_deep_scrub_errors > 0); - bool try_to_auto_repair = (cct->_conf->osd_scrub_auto_repair - && get_pgbackend()->auto_repair_supported()); - - scrubber.time_for_deep = false; - // Clear these in case user issues the scrub/repair command during - // the scheduling of the scrub/repair (e.g. request reservation) - scrubber.deep_scrub_on_error = false; - scrubber.auto_repair = false; + if (scrub_queued) { + // only applicable to the very first time a scrub event is queued + // (until handled and posted to the scrub FSM) + dout(10) << __func__ << ": already queued" << dendl; + return false; + } - // All periodic scrub handling goes here because must_scrub is - // always set for must_deep_scrub and must_repair. 
- if (!scrubber.must_scrub) { - ceph_assert(!scrubber.must_deep_scrub && !scrubber.must_repair); - // Handle deep scrub determination only if allowed - if (allow_deep_scrub) { - // Initial entry and scheduled scrubs without nodeep_scrub set get here - if (scrubber.need_auto) { - dout(20) << __func__ << ": need repair after scrub errors" << dendl; - scrubber.time_for_deep = true; - } else { - double deep_scrub_interval = 0; - pool.info.opts.get(pool_opts_t::DEEP_SCRUB_INTERVAL, &deep_scrub_interval); - if (deep_scrub_interval <= 0) { - deep_scrub_interval = cct->_conf->osd_deep_scrub_interval; - } - scrubber.time_for_deep = ceph_clock_now() >= - info.history.last_deep_scrub_stamp + deep_scrub_interval; - - bool deep_coin_flip = false; - // If we randomize when !allow_scrub && allow_deep_scrub, then it guarantees - // we will deep scrub because this function is called often. - if (!scrubber.time_for_deep && allow_scrub) - deep_coin_flip = (rand() % 100) < cct->_conf->osd_deep_scrub_randomize_ratio * 100; - dout(20) << __func__ << ": time_for_deep=" << scrubber.time_for_deep << " deep_coin_flip=" << deep_coin_flip << dendl; - - scrubber.time_for_deep = (scrubber.time_for_deep || deep_coin_flip); - } + // analyse the combination of the requested scrub flags, the osd/pool configuration + // and the PG status to determine whether we should scrub now, and what type of scrub + // should that be. + auto updated_flags = verify_scrub_mode(); + if (!updated_flags) { + // the stars do not align for starting a scrub for this PG at this time + // (due to configuration or priority issues) + // The reason was already reported by the callee. + dout(10) << __func__ << ": failed to initiate a scrub" << dendl; + return false; + } - if (!scrubber.time_for_deep && has_deep_errors) { - osd->clog->info() << "osd." 
<< osd->whoami - << " pg " << info.pgid - << " Deep scrub errors, upgrading scrub to deep-scrub"; - scrubber.time_for_deep = true; - } + // try to reserve the local OSD resources. If failing: no harm. We will + // be retried by the OSD later on. + if (!m_scrubber->reserve_local()) { + dout(10) << __func__ << ": failed to reserve locally" << dendl; + return false; + } - if (try_to_auto_repair) { - if (scrubber.time_for_deep) { - dout(20) << __func__ << ": auto repair with deep scrubbing" << dendl; - scrubber.auto_repair = true; - } else if (allow_scrub) { - dout(20) << __func__ << ": auto repair with scrubbing, rescrub if errors found" << dendl; - scrubber.deep_scrub_on_error = true; - } - } - } else { // !allow_deep_scrub - dout(20) << __func__ << ": nodeep_scrub set" << dendl; - if (has_deep_errors) { - osd->clog->error() << "osd." << osd->whoami - << " pg " << info.pgid - << " Regular scrub skipped due to deep-scrub errors and nodeep-scrub set"; - return false; - } - } + // can commit to the updated flags now, as nothing will stop the scrub + m_planned_scrub = *updated_flags; - //NOSCRUB so skip regular scrubs - if (!allow_scrub && !scrubber.time_for_deep) { - return false; - } - // scrubber.must_scrub - } else if (!scrubber.must_deep_scrub && has_deep_errors) { - osd->clog->error() << "osd." 
<< osd->whoami - << " pg " << info.pgid - << " Regular scrub request, deep-scrub details will be lost"; - } - // Unless precluded this was handle above - scrubber.need_auto = false; - - ceph_assert(scrubber.reserved_peers.empty()); - bool allow_scrubing = cct->_conf->osd_scrub_during_recovery || - (cct->_conf->osd_repair_during_recovery && scrubber.must_repair) || - !osd->is_recovery_active(); - if (allow_scrubing && - osd->inc_scrubs_local()) { - dout(20) << __func__ << ": reserved locally, reserving replicas" << dendl; - scrubber.local_reserved = true; - scrubber.reserved_peers.insert(pg_whoami); - scrub_reserve_replicas(); - } else { - dout(20) << __func__ << ": failed to reserve locally" << dendl; - return false; - } + // An interrupted recovery repair could leave this set. + state_clear(PG_STATE_REPAIR); + + // Pass control to the scrubber. It is the scrubber that handles the replicas' + // resources reservations. + m_scrubber->set_op_parameters(m_planned_scrub); + + dout(10) << __func__ << ": queueing" << dendl; + + scrub_queued = true; + osd->queue_for_scrub(this, Scrub::scrub_prio_t::low_priority); + return true; +} + +double PG::next_deepscrub_interval() const +{ + double deep_scrub_interval = + pool.info.opts.value_or(pool_opts_t::DEEP_SCRUB_INTERVAL, 0.0); + if (deep_scrub_interval <= 0.0) + deep_scrub_interval = cct->_conf->osd_deep_scrub_interval; + return info.history.last_deep_scrub_stamp + deep_scrub_interval; +} + +bool PG::is_time_for_deep(bool allow_deep_scrub, + bool allow_scrub, + bool has_deep_errors, + const requested_scrub_t& planned) const +{ + dout(10) << __func__ << ": need_auto?" << planned.need_auto << " allow_deep_scrub? 
" << allow_deep_scrub << dendl; + + if (!allow_deep_scrub) + return false; + + if (planned.need_auto) { + dout(10) << __func__ << ": need repair after scrub errors" << dendl; + return true; } - if (scrubber.local_reserved) { - if (scrubber.reserve_failed) { - dout(20) << __func__ << ": failed, a peer declined" << dendl; - clear_scrub_reserved(); - scrub_unreserve_replicas(); + if (ceph_clock_now() >= next_deepscrub_interval()) + return true; + + if (has_deep_errors) { + osd->clog->info() << "osd." << osd->whoami << " pg " << info.pgid + << " Deep scrub errors, upgrading scrub to deep-scrub"; + return true; + } + + // we only flip coins if 'allow_scrub' is asserted. Otherwise - as this function is + // called often, we will probably be deep-scrubbing most of the time. + if (allow_scrub) { + bool deep_coin_flip = + (rand() % 100) < cct->_conf->osd_deep_scrub_randomize_ratio * 100; + + dout(15) << __func__ << ": time_for_deep=" << planned.time_for_deep + << " deep_coin_flip=" << deep_coin_flip << dendl; + + if (deep_coin_flip) + return true; + } + + return false; +} + +bool PG::verify_periodic_scrub_mode(bool allow_deep_scrub, + bool try_to_auto_repair, + bool allow_regular_scrub, + bool has_deep_errors, + requested_scrub_t& planned) const + +{ + ceph_assert(!planned.must_deep_scrub && !planned.must_repair); + + if (!allow_deep_scrub && has_deep_errors) { + osd->clog->error() + << "osd." 
<< osd->whoami << " pg " << info.pgid + << " Regular scrub skipped due to deep-scrub errors and nodeep-scrub set"; return false; - } else if (scrubber.reserved_peers.size() == get_actingset().size()) { - dout(20) << __func__ << ": success, reserved self and replicas" << dendl; - if (scrubber.time_for_deep) { - dout(10) << __func__ << ": scrub will be deep" << dendl; - state_set(PG_STATE_DEEP_SCRUB); - scrubber.time_for_deep = false; + } + + if (allow_deep_scrub) { + // Initial entry and scheduled scrubs without nodeep_scrub set get here + + planned.time_for_deep = + is_time_for_deep(allow_deep_scrub, allow_regular_scrub, has_deep_errors, planned); + + if (try_to_auto_repair) { + if (planned.time_for_deep) { + dout(20) << __func__ << ": auto repair with deep scrubbing" << dendl; + planned.auto_repair = true; + } else if (allow_regular_scrub) { + dout(20) << __func__ << ": auto repair with scrubbing, rescrub if errors found" + << dendl; + planned.deep_scrub_on_error = true; } - queue_scrub(); - } else { - // none declined, since scrubber.reserved is set - dout(20) << __func__ << ": reserved " << scrubber.reserved_peers - << ", waiting for replicas" << dendl; } } + + dout(20) << __func__ << " updated flags: " << planned + << " allow_regular_scrub: " << allow_regular_scrub << dendl; + + // NOSCRUB so skip regular scrubs + if (!allow_regular_scrub && !planned.time_for_deep) { + return false; + } + return true; } -bool PG::is_scrub_registered() +std::optional PG::verify_scrub_mode() const { - return !scrubber.scrub_reg_stamp.is_zero(); -} + dout(10) << __func__ << " processing pg " << info.pgid << dendl; -void PG::reg_next_scrub() -{ - if (!is_primary()) - return; + bool allow_deep_scrub = !(get_osdmap()->test_flag(CEPH_OSDMAP_NODEEP_SCRUB) || + pool.info.has_flag(pg_pool_t::FLAG_NODEEP_SCRUB)); + bool allow_regular_scrub = !(get_osdmap()->test_flag(CEPH_OSDMAP_NOSCRUB) || + pool.info.has_flag(pg_pool_t::FLAG_NOSCRUB)); + bool has_deep_errors = 
(info.stats.stats.sum.num_deep_scrub_errors > 0); + bool try_to_auto_repair = + (cct->_conf->osd_scrub_auto_repair && get_pgbackend()->auto_repair_supported()); - utime_t reg_stamp; - bool must = false; - if (scrubber.must_scrub || scrubber.need_auto) { - // Set the smallest time that isn't utime_t() - reg_stamp = Scrubber::scrub_must_stamp(); - must = true; - } else if (info.stats.stats_invalid && cct->_conf->osd_scrub_invalid_stats) { - reg_stamp = ceph_clock_now(); - must = true; - } else { - reg_stamp = info.history.last_scrub_stamp; + auto upd_flags = m_planned_scrub; + + upd_flags.time_for_deep = false; + // Clear these in case user issues the scrub/repair command during + // the scheduling of the scrub/repair (e.g. request reservation) + upd_flags.deep_scrub_on_error = false; + upd_flags.auto_repair = false; + + if (upd_flags.must_scrub && !upd_flags.must_deep_scrub && has_deep_errors) { + osd->clog->error() << "osd." << osd->whoami << " pg " << info.pgid + << " Regular scrub request, deep-scrub details will be lost"; + } + + if (!upd_flags.must_scrub) { + // All periodic scrub handling goes here because must_scrub is + // always set for must_deep_scrub and must_repair. + + bool can_start_periodic = + verify_periodic_scrub_mode(allow_deep_scrub, try_to_auto_repair, + allow_regular_scrub, has_deep_errors, upd_flags); + if (!can_start_periodic) { + return std::nullopt; + } } - // note down the sched_time, so we can locate this scrub, and remove it - // later on. 
- double scrub_min_interval = 0, scrub_max_interval = 0; - pool.info.opts.get(pool_opts_t::SCRUB_MIN_INTERVAL, &scrub_min_interval); - pool.info.opts.get(pool_opts_t::SCRUB_MAX_INTERVAL, &scrub_max_interval); - ceph_assert(!is_scrub_registered()); - scrubber.scrub_reg_stamp = osd->reg_pg_scrub(info.pgid, - reg_stamp, - scrub_min_interval, - scrub_max_interval, - must); - dout(10) << __func__ << " pg " << pg_id << " register next scrub, scrub time " - << scrubber.scrub_reg_stamp << ", must = " << (int)must << dendl; -} - -void PG::unreg_next_scrub() -{ - if (is_scrub_registered()) { - osd->unreg_pg_scrub(info.pgid, scrubber.scrub_reg_stamp); - scrubber.scrub_reg_stamp = utime_t(); + + // scrubbing while recovering? + + bool prevented_by_recovery = + osd->is_recovery_active() && !cct->_conf->osd_scrub_during_recovery && + (!cct->_conf->osd_repair_during_recovery || !upd_flags.must_repair); + + if (prevented_by_recovery) { + dout(20) << __func__ << ": scrubbing prevented during recovery" << dendl; + return std::nullopt; } + + upd_flags.need_auto = false; + return upd_flags; +} + +void PG::reg_next_scrub() +{ + m_scrubber->reg_next_scrub(m_planned_scrub); } void PG::on_info_history_change() { - unreg_next_scrub(); - reg_next_scrub(); + m_scrubber->unreg_next_scrub(); + m_scrubber->reg_next_scrub(m_planned_scrub); } -void PG::scrub_requested(bool deep, bool repair, bool need_auto) +void PG::scrub_requested(scrub_level_t scrub_level, scrub_type_t scrub_type) { - unreg_next_scrub(); - if (need_auto) { - scrubber.need_auto = true; - } else { - scrubber.must_scrub = true; - scrubber.must_deep_scrub = deep || repair; - scrubber.must_repair = repair; - // User might intervene, so clear this - scrubber.need_auto = false; - scrubber.req_scrub = true; - } - reg_next_scrub(); + m_scrubber->scrub_requested(scrub_level, scrub_type, m_planned_scrub); } void PG::clear_ready_to_merge() { @@ -1616,6 +1568,7 @@ void PG::on_role_change() { } void PG::on_new_interval() { + dout(20) << 
__func__ << " scrub_queued was " << scrub_queued << " flags: " << m_planned_scrub << dendl; scrub_queued = false; projected_last_update = eversion_t(); cancel_recovery(); @@ -1698,6 +1651,15 @@ void PG::schedule_event_on_commit( t.register_on_commit(new QueuePeeringEvt(this, on_commit)); } +void PG::on_activate(interval_set snaps) +{ + ceph_assert(!m_scrubber->are_callbacks_pending()); + ceph_assert(callbacks_for_degraded_object.empty()); + snap_trimq = snaps; + release_pg_backoffs(); + projected_last_update = info.last_update; +} + void PG::on_active_exit() { backfill_reserving = false; @@ -1903,133 +1865,6 @@ void PG::on_activate_committed() } } -void PG::do_replica_scrub_map(OpRequestRef op) -{ - auto m = op->get_req(); - dout(7) << __func__ << " " << *m << dendl; - if (m->map_epoch < info.history.same_interval_since) { - dout(10) << __func__ << " discarding old from " - << m->map_epoch << " < " << info.history.same_interval_since - << dendl; - return; - } - if (!scrubber.is_chunky_scrub_active()) { - dout(10) << __func__ << " scrub isn't active" << dendl; - return; - } - - op->mark_started(); - - auto p = const_cast(m->get_data()).cbegin(); - scrubber.received_maps[m->from].decode(p, info.pgid.pool()); - dout(10) << "map version is " - << scrubber.received_maps[m->from].valid_through - << dendl; - - dout(10) << __func__ << " waiting_on_whom was " << scrubber.waiting_on_whom - << dendl; - ceph_assert(scrubber.waiting_on_whom.count(m->from)); - scrubber.waiting_on_whom.erase(m->from); - if (m->preempted) { - dout(10) << __func__ << " replica was preempted, setting flag" << dendl; - scrub_preempted = true; - } - if (scrubber.waiting_on_whom.empty()) { - requeue_scrub(ops_blocked_by_scrub()); - } -} - -// send scrub v3 messages (chunky scrub) -void PG::_request_scrub_map( - pg_shard_t replica, eversion_t version, - hobject_t start, hobject_t end, - bool deep, - bool allow_preemption) -{ - ceph_assert(replica != pg_whoami); - dout(10) << "scrub requesting scrubmap 
from osd." << replica - << " deep " << (int)deep << dendl; - MOSDRepScrub *repscrubop = new MOSDRepScrub( - spg_t(info.pgid.pgid, replica.shard), version, - get_osdmap_epoch(), - get_last_peering_reset(), - start, end, deep, - allow_preemption, - scrubber.priority, - ops_blocked_by_scrub()); - // default priority, we want the rep scrub processed prior to any recovery - // or client io messages (we are holding a lock!) - osd->send_message_osd_cluster( - replica.osd, repscrubop, get_osdmap_epoch()); -} - -void PG::handle_scrub_reserve_request(OpRequestRef op) -{ - dout(7) << __func__ << " " << *op->get_req() << dendl; - op->mark_started(); - if (scrubber.remote_reserved) { - dout(10) << __func__ << " ignoring reserve request: Already reserved" - << dendl; - return; - } - if ((cct->_conf->osd_scrub_during_recovery || !osd->is_recovery_active()) && - osd->inc_scrubs_remote()) { - scrubber.remote_reserved = true; - } else { - dout(20) << __func__ << ": failed to reserve remotely" << dendl; - scrubber.remote_reserved = false; - } - auto m = op->get_req(); - Message *reply = new MOSDScrubReserve( - spg_t(info.pgid.pgid, get_primary().shard), - m->map_epoch, - scrubber.remote_reserved ? MOSDScrubReserve::GRANT : MOSDScrubReserve::REJECT, - pg_whoami); - osd->send_message_osd_cluster(reply, op->get_req()->get_connection()); -} - -void PG::handle_scrub_reserve_grant(OpRequestRef op, pg_shard_t from) -{ - dout(7) << __func__ << " " << *op->get_req() << dendl; - op->mark_started(); - if (!scrubber.local_reserved) { - dout(10) << "ignoring obsolete scrub reserve reply" << dendl; - return; - } - if (scrubber.reserved_peers.find(from) != scrubber.reserved_peers.end()) { - dout(10) << " already had osd." << from << " reserved" << dendl; - } else { - dout(10) << " osd." 
<< from << " scrub reserve = success" << dendl; - scrubber.reserved_peers.insert(from); - sched_scrub(); - } -} - -void PG::handle_scrub_reserve_reject(OpRequestRef op, pg_shard_t from) -{ - dout(7) << __func__ << " " << *op->get_req() << dendl; - op->mark_started(); - if (!scrubber.local_reserved) { - dout(10) << "ignoring obsolete scrub reserve reply" << dendl; - return; - } - if (scrubber.reserved_peers.find(from) != scrubber.reserved_peers.end()) { - dout(10) << " already had osd." << from << " reserved" << dendl; - } else { - /* One decline stops this pg from being scheduled for scrubbing. */ - dout(10) << " osd." << from << " scrub reserve = fail" << dendl; - scrubber.reserve_failed = true; - sched_scrub(); - } -} - -void PG::handle_scrub_reserve_release(OpRequestRef op) -{ - dout(7) << __func__ << " " << *op->get_req() << dendl; - op->mark_started(); - clear_scrub_reserved(); -} - // Compute pending backfill data static int64_t pending_backfill(CephContext *cct, int64_t bf_bytes, int64_t local_bytes) { @@ -2117,62 +1952,6 @@ bool PG::try_reserve_recovery_space( void PG::unreserve_recovery_space() { primary_num_bytes.store(0); local_num_bytes.store(0); - return; -} - -void PG::clear_scrub_reserved() -{ - scrubber.reserved_peers.clear(); - scrubber.reserve_failed = false; - - if (scrubber.local_reserved) { - scrubber.local_reserved = false; - osd->dec_scrubs_local(); - } - if (scrubber.remote_reserved) { - scrubber.remote_reserved = false; - osd->dec_scrubs_remote(); - } -} - -void PG::scrub_reserve_replicas() -{ - ceph_assert(recovery_state.get_backfill_targets().empty()); - std::vector> messages; - messages.reserve(get_actingset().size()); - epoch_t e = get_osdmap_epoch(); - for (set::iterator i = get_actingset().begin(); - i != get_actingset().end(); - ++i) { - if (*i == pg_whoami) continue; - dout(10) << "scrub requesting reserve from osd." 
<< *i << dendl; - Message* m = new MOSDScrubReserve(spg_t(info.pgid.pgid, i->shard), e, - MOSDScrubReserve::REQUEST, pg_whoami); - messages.push_back(std::make_pair(i->osd, m)); - } - if (!messages.empty()) { - osd->send_message_osd_cluster(messages, e); - } -} - -void PG::scrub_unreserve_replicas() -{ - ceph_assert(recovery_state.get_backfill_targets().empty()); - std::vector> messages; - messages.reserve(get_actingset().size()); - epoch_t e = get_osdmap_epoch(); - for (set::iterator i = get_actingset().begin(); - i != get_actingset().end(); - ++i) { - if (*i == pg_whoami) continue; - dout(10) << "scrub requesting unreserve from osd." << *i << dendl; - Message* m = new MOSDScrubReserve(spg_t(info.pgid.pgid, i->shard), e, - MOSDScrubReserve::RELEASE, pg_whoami); - messages.push_back(std::make_pair(i->osd, m)); - } - if (!messages.empty()) { - osd->send_message_osd_cluster(messages, e); - } } void PG::_scan_rollback_obs(const vector &rollback_obs) @@ -2199,111 +1978,6 @@ void PG::_scan_rollback_obs(const vector &rollback_obs) } } -void PG::_scan_snaps(ScrubMap &smap) -{ - hobject_t head; - SnapSet snapset; - - // Test qa/standalone/scrub/osd-scrub-snaps.sh uses this message to verify - // caller using clean_meta_map(), and it works properly. - dout(20) << __func__ << " start" << dendl; - - for (map::reverse_iterator i = smap.objects.rbegin(); - i != smap.objects.rend(); - ++i) { - const hobject_t &hoid = i->first; - ScrubMap::object &o = i->second; - - dout(20) << __func__ << " " << hoid << dendl; - - ceph_assert(!hoid.is_snapdir()); - if (hoid.is_head()) { - // parse the SnapSet - bufferlist bl; - if (o.attrs.find(SS_ATTR) == o.attrs.end()) { - continue; - } - bl.push_back(o.attrs[SS_ATTR]); - auto p = bl.cbegin(); - try { - decode(snapset, p); - } catch(...) 
{ - continue; - } - head = hoid.get_head(); - continue; - } - if (hoid.snap < CEPH_MAXSNAP) { - // check and if necessary fix snap_mapper - if (hoid.get_head() != head) { - derr << __func__ << " no head for " << hoid << " (have " << head << ")" - << dendl; - continue; - } - set obj_snaps; - auto p = snapset.clone_snaps.find(hoid.snap); - if (p == snapset.clone_snaps.end()) { - derr << __func__ << " no clone_snaps for " << hoid << " in " << snapset - << dendl; - continue; - } - obj_snaps.insert(p->second.begin(), p->second.end()); - set cur_snaps; - int r = snap_mapper.get_snaps(hoid, &cur_snaps); - if (r != 0 && r != -ENOENT) { - derr << __func__ << ": get_snaps returned " << cpp_strerror(r) << dendl; - ceph_abort(); - } - if (r == -ENOENT || cur_snaps != obj_snaps) { - ObjectStore::Transaction t; - OSDriver::OSTransaction _t(osdriver.get_transaction(&t)); - if (r == 0) { - r = snap_mapper.remove_oid(hoid, &_t); - if (r != 0) { - derr << __func__ << ": remove_oid returned " << cpp_strerror(r) - << dendl; - ceph_abort(); - } - osd->clog->error() << "osd." << osd->whoami - << " found snap mapper error on pg " - << info.pgid - << " oid " << hoid << " snaps in mapper: " - << cur_snaps << ", oi: " - << obj_snaps - << "...repaired"; - } else { - osd->clog->error() << "osd." << osd->whoami - << " found snap mapper error on pg " - << info.pgid - << " oid " << hoid << " snaps missing in mapper" - << ", should be: " - << obj_snaps - << " was " << cur_snaps << " r " << r - << "...repaired"; - } - snap_mapper.add_oid(hoid, obj_snaps, &_t); - - // wait for repair to apply to avoid confusing other bits of the system. 
- { - ceph::condition_variable my_cond; - ceph::mutex my_lock = ceph::make_mutex("PG::_scan_snaps my_lock"); - int r = 0; - bool done; - t.register_on_applied_sync( - new C_SafeCond(my_lock, my_cond, &done, &r)); - r = osd->store->queue_transaction(ch, std::move(t)); - if (r != 0) { - derr << __func__ << ": queue_transaction got " << cpp_strerror(r) - << dendl; - } else { - std::unique_lock l{my_lock}; - my_cond.wait(l, [&done] { return done;}); - } - } - } - } - } -} void PG::_repair_oinfo_oid(ScrubMap &smap) { @@ -2350,82 +2024,6 @@ void PG::_repair_oinfo_oid(ScrubMap &smap) } } } -int PG::build_scrub_map_chunk( - ScrubMap &map, - ScrubMapBuilder &pos, - hobject_t start, - hobject_t end, - bool deep, - ThreadPool::TPHandle &handle) -{ - dout(10) << __func__ << " [" << start << "," << end << ") " - << " pos " << pos - << dendl; - - // start - while (pos.empty()) { - pos.deep = deep; - map.valid_through = info.last_update; - - // objects - vector rollback_obs; - pos.ret = get_pgbackend()->objects_list_range( - start, - end, - &pos.ls, - &rollback_obs); - if (pos.ret < 0) { - dout(5) << "objects_list_range error: " << pos.ret << dendl; - return pos.ret; - } - if (pos.ls.empty()) { - break; - } - _scan_rollback_obs(rollback_obs); - pos.pos = 0; - return -EINPROGRESS; - } - - // scan objects - while (!pos.done()) { - int r = get_pgbackend()->be_scan_list(map, pos); - if (r == -EINPROGRESS) { - return r; - } - } - - // finish - dout(20) << __func__ << " finishing" << dendl; - ceph_assert(pos.done()); - _repair_oinfo_oid(map); - if (!is_primary()) { - ScrubMap for_meta_scrub; - // In case we restarted smaller chunk, clear old data - scrubber.cleaned_meta_map.clear_from(scrubber.start); - scrubber.cleaned_meta_map.insert(map); - scrubber.clean_meta_map(for_meta_scrub); - _scan_snaps(for_meta_scrub); - } - - dout(20) << __func__ << " done, got " << map.objects.size() << " items" - << dendl; - return 0; -} - -void PG::Scrubber::cleanup_store(ObjectStore::Transaction *t) { 
- if (!store) - return; - struct OnComplete : Context { - std::unique_ptr store; - explicit OnComplete( - std::unique_ptr &&store) - : store(std::move(store)) {} - void finish(int) override {} - }; - store->cleanup(t); - t->register_on_complete(new OnComplete(std::move(store))); - ceph_assert(!store); -} void PG::repair_object( const hobject_t &soid, @@ -2466,950 +2064,170 @@ void PG::repair_object( recovery_state.force_object_missing(bad_peers, soid, oi.version); } -/* replica_scrub - * - * Wait for last_update_applied to match msg->scrub_to as above. Wait - * for pushes to complete in case of recent recovery. Build a single - * scrubmap of objects that are in the range [msg->start, msg->end). - */ -void PG::replica_scrub( - OpRequestRef op, - ThreadPool::TPHandle &handle) +void PG::replica_scrub(OpRequestRef op, ThreadPool::TPHandle& handle) { - auto msg = op->get_req(); - ceph_assert(!scrubber.active_rep_scrub); - dout(7) << "replica_scrub" << dendl; + dout(10) << __func__ << " (op)" << dendl; + m_scrubber->replica_scrub_op(op); +} - if (msg->map_epoch < info.history.same_interval_since) { - dout(10) << "replica_scrub discarding old replica_scrub from " - << msg->map_epoch << " < " << info.history.same_interval_since - << dendl; - return; - } +void PG::scrub(epoch_t queued, ThreadPool::TPHandle& handle) +{ + dout(10) << __func__ << (is_primary() ? 
" (primary)" : " (replica)") << dendl; - ceph_assert(msg->chunky); - if (active_pushes > 0) { - dout(10) << "waiting for active pushes to finish" << dendl; - scrubber.active_rep_scrub = op; - return; - } + scrub_queued = false; - scrubber.state = Scrubber::BUILD_MAP_REPLICA; - scrubber.replica_scrub_start = msg->min_epoch; - scrubber.start = msg->start; - scrubber.end = msg->end; - scrubber.max_end = msg->end; - scrubber.deep = msg->deep; - scrubber.epoch_start = info.history.same_interval_since; - if (msg->priority) { - scrubber.priority = msg->priority; - } else { - scrubber.priority = get_scrub_priority(); + if (pg_has_reset_since(queued)) { + dout(10) << " pg::scrub reset_since " << __func__ << " " << queued << dendl; + dout(10) << " pg::scrub reset_since " << __func__ << " " + << recovery_state.get_last_peering_reset() << dendl; + m_scrubber->scrub_clear_state(false); + return; } - scrub_can_preempt = msg->allow_preemption; - scrub_preempted = false; - scrubber.replica_scrubmap_pos.reset(); + ceph_assert( + is_primary()); // as the replica request should have reached PG::replica_scrub() - requeue_scrub(msg->high_priority); + ceph_assert(!m_scrubber->is_scrub_active()); + // a new scrub + m_scrubber->reset_epoch(queued); + m_scrubber->send_start_scrub(); } -/* Scrub: - * PG_STATE_SCRUBBING is set when the scrub is queued - * - * scrub will be chunky if all OSDs in PG support chunky scrub - * scrub will fail if OSDs are too old. 
- */ -void PG::scrub(epoch_t queued, ThreadPool::TPHandle &handle) -{ - OSDService *osds = osd; - double scrub_sleep = osds->osd->scrub_sleep_time(scrubber.must_scrub); - if (scrub_sleep > 0 && - (scrubber.state == PG::Scrubber::NEW_CHUNK || - scrubber.state == PG::Scrubber::INACTIVE) && - scrubber.needs_sleep) { - ceph_assert(!scrubber.sleeping); - dout(20) << __func__ << " state is INACTIVE|NEW_CHUNK, sleeping" << dendl; - - // Do an async sleep so we don't block the op queue - spg_t pgid = get_pgid(); - int state = scrubber.state; - auto scrub_requeue_callback = - new LambdaContext([osds, pgid, state](int r) { - PGRef pg = osds->osd->lookup_lock_pg(pgid); - if (pg == nullptr) { - lgeneric_dout(osds->osd->cct, 20) - << "scrub_requeue_callback: Could not find " - << "PG " << pgid << " can't complete scrub requeue after sleep" - << dendl; - return; - } - pg->scrubber.sleeping = false; - pg->scrubber.needs_sleep = false; - lgeneric_dout(pg->cct, 20) - << "scrub_requeue_callback: slept for " - << ceph_clock_now() - pg->scrubber.sleep_start - << ", re-queuing scrub with state " << state << dendl; - pg->scrub_queued = false; - pg->requeue_scrub(); - pg->scrubber.sleep_start = utime_t(); - pg->unlock(); - }); - std::lock_guard l(osd->sleep_lock); - osd->sleep_timer.add_event_after(scrub_sleep, - scrub_requeue_callback); - scrubber.sleeping = true; - scrubber.sleep_start = ceph_clock_now(); - return; - } - if (pg_has_reset_since(queued)) { - return; - } - ceph_assert(scrub_queued); - scrub_queued = false; - scrubber.needs_sleep = true; +// note: no need to secure OSD resources for a recovery scrub +void PG::recovery_scrub(epoch_t epoch_queued, ThreadPool::TPHandle& handle) +{ + dout(10) << "pg::" << __func__ << " queued at: " << epoch_queued << dendl; - // for the replica - if (!is_primary() && - scrubber.state == PG::Scrubber::BUILD_MAP_REPLICA) { - chunky_scrub(handle); - return; - } + scrub_queued = false; - if (!is_primary() || !is_active() || !is_clean() || 
!is_scrubbing()) { - dout(10) << "scrub -- not primary or active or not clean" << dendl; - state_clear(PG_STATE_SCRUBBING); - state_clear(PG_STATE_REPAIR); - state_clear(PG_STATE_DEEP_SCRUB); - publish_stats_to_osd(); + if (pg_has_reset_since(epoch_queued)) { + dout(10) << " reset_since " << __func__ << " " << epoch_queued << dendl; + dout(10) << " reset_since " << __func__ << " " + << recovery_state.get_last_peering_reset() << dendl; return; } - if (!scrubber.active) { - ceph_assert(recovery_state.get_backfill_targets().empty()); - - scrubber.deep = state_test(PG_STATE_DEEP_SCRUB); + ceph_assert(is_primary()); + ceph_assert(!m_scrubber->is_scrub_active()); - dout(10) << "starting a new chunky scrub" << dendl; - } - - chunky_scrub(handle); + // a new scrub + m_scrubber->reset_epoch(epoch_queued); + m_scrubber->send_start_after_repair(); } -void PG::abort_scrub() +void PG::replica_scrub(epoch_t epoch_queued, + [[maybe_unused]] ThreadPool::TPHandle& handle) { - scrub_clear_state(); - scrub_unreserve_replicas(); + dout(10) << "pg::" << __func__ << " queued at: " << epoch_queued + << (is_primary() ? " (primary)" : " (replica)") << dendl; + scrub_queued = false; + m_scrubber->replica_scrub(epoch_queued); } -/* - * Chunky scrub scrubs objects one chunk at a time with writes blocked for that - * chunk. - * - * The object store is partitioned into chunks which end on hash boundaries. 
For - * each chunk, the following logic is performed: - * - * (1) Block writes on the chunk - * (2) Request maps from replicas - * (3) Wait for pushes to be applied (after recovery) - * (4) Wait for writes to flush on the chunk - * (5) Wait for maps from replicas - * (6) Compare / repair all scrub maps - * (7) Wait for digest updates to apply - * - * This logic is encoded in the mostly linear state machine: - * - * +------------------+ - * _________v__________ | - * | | | - * | INACTIVE | | - * |____________________| | - * | | - * | +----------+ | - * _________v___v______ | | - * | | | | - * | NEW_CHUNK | | | - * |____________________| | | - * | | | - * _________v__________ | | - * | | | | - * | WAIT_PUSHES | | | - * |____________________| | | - * | | | - * _________v__________ | | - * | | | | - * | WAIT_LAST_UPDATE | | | - * |____________________| | | - * | | | - * _________v__________ | | - * | | | | - * | BUILD_MAP | | | - * |____________________| | | - * | | | - * _________v__________ | | - * | | | | - * | WAIT_REPLICAS | | | - * |____________________| | | - * | | | - * _________v__________ | | - * | | | | - * | COMPARE_MAPS | | | - * |____________________| | | - * | | | - * | | | - * _________v__________ | | - * | | | | - * |WAIT_DIGEST_UPDATES | | | - * |____________________| | | - * | | | | - * | +----------+ | - * _________v__________ | - * | | | - * | FINISH | | - * |____________________| | - * | | - * +------------------+ - * - * The primary determines the last update from the subset by walking the log. If - * it sees a log entry pertaining to a file in the chunk, it tells the replicas - * to wait until that update is applied before building a scrub map. Both the - * primary and replicas will wait for any active pushes to be applied. - * - * In contrast to classic_scrub, chunky_scrub is entirely handled by scrub_wq. - * - * scrubber.state encodes the current state of the scrub (refer to state diagram - * for details). 
- */ -void PG::chunky_scrub(ThreadPool::TPHandle &handle) -{ - // Since repair is only by request and we need to scrub afterward - // treat the same as req_scrub. - if (!scrubber.req_scrub) { - if (state_test(PG_STATE_DEEP_SCRUB)) { - if (get_osdmap()->test_flag(CEPH_OSDMAP_NODEEP_SCRUB) || - pool.info.has_flag(pg_pool_t::FLAG_NODEEP_SCRUB)) { - dout(10) << "nodeep_scrub set, aborting" << dendl; - abort_scrub(); - return; - } - } else if (state_test(PG_STATE_SCRUBBING)) { - if (get_osdmap()->test_flag(CEPH_OSDMAP_NOSCRUB) || pool.info.has_flag(pg_pool_t::FLAG_NOSCRUB)) { - dout(10) << "noscrub set, aborting" << dendl; - abort_scrub(); - return; - } - } - } - // check for map changes - if (scrubber.is_chunky_scrub_active()) { - if (scrubber.epoch_start != info.history.same_interval_since) { - dout(10) << "scrub pg changed, aborting" << dendl; - abort_scrub(); - return; - } - } - - bool done = false; - int ret; - - while (!done) { - dout(20) << "scrub state " << Scrubber::state_string(scrubber.state) - << " [" << scrubber.start << "," << scrubber.end << ")" - << " max_end " << scrubber.max_end << dendl; - - switch (scrubber.state) { - case PG::Scrubber::INACTIVE: - dout(10) << "scrub start" << dendl; - ceph_assert(is_primary()); - - publish_stats_to_osd(); - scrubber.epoch_start = info.history.same_interval_since; - scrubber.active = true; - - { - ObjectStore::Transaction t; - scrubber.cleanup_store(&t); - scrubber.store.reset(Scrub::Store::create(osd->store, &t, - info.pgid, coll)); - osd->store->queue_transaction(ch, std::move(t), nullptr); - } - - // Don't include temporary objects when scrubbing - scrubber.start = info.pgid.pgid.get_hobj_start(); - scrubber.state = PG::Scrubber::NEW_CHUNK; - - { - bool repair = state_test(PG_STATE_REPAIR); - bool deep_scrub = state_test(PG_STATE_DEEP_SCRUB); - const char *mode = (repair ? "repair": (deep_scrub ? 
"deep-scrub" : "scrub")); - stringstream oss; - oss << info.pgid.pgid << " " << mode << " starts" << std::endl; - osd->clog->debug(oss); - } - - scrubber.preempt_left = cct->_conf.get_val( - "osd_scrub_max_preemptions"); - scrubber.preempt_divisor = 1; - break; - - case PG::Scrubber::NEW_CHUNK: - scrubber.primary_scrubmap = ScrubMap(); - scrubber.received_maps.clear(); - - // begin (possible) preemption window - if (scrub_preempted) { - scrubber.preempt_left--; - scrubber.preempt_divisor *= 2; - dout(10) << __func__ << " preempted, " << scrubber.preempt_left - << " left" << dendl; - scrub_preempted = false; - } - scrub_can_preempt = scrubber.preempt_left > 0; - - { - /* get the start and end of our scrub chunk - * - * Our scrub chunk has an important restriction we're going to need to - * respect. We can't let head be start or end. - * Using a half-open interval means that if end == head, - * we'd scrub/lock head and the clone right next to head in different - * chunks which would allow us to miss clones created between - * scrubbing that chunk and scrubbing the chunk including head. - * This isn't true for any of the other clones since clones can - * only be created "just to the left of" head. There is one exception - * to this: promotion of clones which always happens to the left of the - * left-most clone, but promote_object checks the scrubber in that - * case, so it should be ok. Also, it's ok to "miss" clones at the - * left end of the range if we are a tier because they may legitimately - * not exist (see _scrub). 
- */ - ceph_assert(scrubber.preempt_divisor > 0); - int min = std::max(3, cct->_conf->osd_scrub_chunk_min / - scrubber.preempt_divisor); - int max = std::max(min, cct->_conf->osd_scrub_chunk_max / - scrubber.preempt_divisor); - hobject_t start = scrubber.start; - hobject_t candidate_end; - vector objects; - ret = get_pgbackend()->objects_list_partial( - start, - min, - max, - &objects, - &candidate_end); - ceph_assert(ret >= 0); - - if (!objects.empty()) { - hobject_t back = objects.back(); - while (candidate_end.is_head() && - candidate_end == back.get_head()) { - candidate_end = back; - objects.pop_back(); - if (objects.empty()) { - ceph_assert(0 == - "Somehow we got more than 2 objects which" - "have the same head but are not clones"); - } - back = objects.back(); - } - if (candidate_end.is_head()) { - ceph_assert(candidate_end != back.get_head()); - candidate_end = candidate_end.get_object_boundary(); - } - } else { - ceph_assert(candidate_end.is_max()); - } - - if (!_range_available_for_scrub(scrubber.start, candidate_end)) { - // we'll be requeued by whatever made us unavailable for scrub - dout(10) << __func__ << ": scrub blocked somewhere in range " - << "[" << scrubber.start << ", " << candidate_end << ")" - << dendl; - done = true; - break; - } - scrubber.end = candidate_end; - if (scrubber.end > scrubber.max_end) - scrubber.max_end = scrubber.end; - } - - // walk the log to find the latest update that affects our chunk - scrubber.subset_last_update = eversion_t(); - for (auto p = projected_log.log.rbegin(); - p != projected_log.log.rend(); - ++p) { - if (p->soid >= scrubber.start && - p->soid < scrubber.end) { - scrubber.subset_last_update = p->version; - break; - } - } - if (scrubber.subset_last_update == eversion_t()) { - for (list::const_reverse_iterator p = - recovery_state.get_pg_log().get_log().log.rbegin(); - p != recovery_state.get_pg_log().get_log().log.rend(); - ++p) { - if (p->soid >= scrubber.start && - p->soid < scrubber.end) { - 
scrubber.subset_last_update = p->version; - break; - } - } - } - - scrubber.state = PG::Scrubber::WAIT_PUSHES; - break; - - case PG::Scrubber::WAIT_PUSHES: - if (active_pushes == 0) { - scrubber.state = PG::Scrubber::WAIT_LAST_UPDATE; - } else { - dout(15) << "wait for pushes to apply" << dendl; - done = true; - } - break; - - case PG::Scrubber::WAIT_LAST_UPDATE: - if (recovery_state.get_last_update_applied() < - scrubber.subset_last_update) { - // will be requeued by op_applied - dout(15) << "wait for EC read/modify/writes to queue" << dendl; - done = true; - break; - } - - // ask replicas to scan - scrubber.waiting_on_whom.insert(pg_whoami); - - // request maps from replicas - for (set::iterator i = get_acting_recovery_backfill().begin(); - i != get_acting_recovery_backfill().end(); - ++i) { - if (*i == pg_whoami) continue; - _request_scrub_map(*i, scrubber.subset_last_update, - scrubber.start, scrubber.end, scrubber.deep, - scrubber.preempt_left > 0); - scrubber.waiting_on_whom.insert(*i); - } - dout(10) << __func__ << " waiting_on_whom " << scrubber.waiting_on_whom - << dendl; - - scrubber.state = PG::Scrubber::BUILD_MAP; - scrubber.primary_scrubmap_pos.reset(); - break; - - case PG::Scrubber::BUILD_MAP: - ceph_assert(recovery_state.get_last_update_applied() >= - scrubber.subset_last_update); - - // build my own scrub map - if (scrub_preempted) { - dout(10) << __func__ << " preempted" << dendl; - scrubber.state = PG::Scrubber::BUILD_MAP_DONE; - break; - } - ret = build_scrub_map_chunk( - scrubber.primary_scrubmap, - scrubber.primary_scrubmap_pos, - scrubber.start, scrubber.end, - scrubber.deep, - handle); - if (ret == -EINPROGRESS) { - requeue_scrub(); - done = true; - break; - } - scrubber.state = PG::Scrubber::BUILD_MAP_DONE; - break; - - case PG::Scrubber::BUILD_MAP_DONE: - if (scrubber.primary_scrubmap_pos.ret < 0) { - dout(5) << "error: " << scrubber.primary_scrubmap_pos.ret - << ", aborting" << dendl; - scrub_clear_state(); - scrub_unreserve_replicas(); - 
return; - } - dout(10) << __func__ << " waiting_on_whom was " - << scrubber.waiting_on_whom << dendl; - ceph_assert(scrubber.waiting_on_whom.count(pg_whoami)); - scrubber.waiting_on_whom.erase(pg_whoami); - - scrubber.state = PG::Scrubber::WAIT_REPLICAS; - break; - - case PG::Scrubber::WAIT_REPLICAS: - if (!scrubber.waiting_on_whom.empty()) { - // will be requeued by do_replica_scrub_map - dout(10) << "wait for replicas to build scrub map" << dendl; - done = true; - break; - } - // end (possible) preemption window - scrub_can_preempt = false; - if (scrub_preempted) { - dout(10) << __func__ << " preempted, restarting chunk" << dendl; - scrubber.state = PG::Scrubber::NEW_CHUNK; - } else { - scrubber.state = PG::Scrubber::COMPARE_MAPS; - } - break; - - case PG::Scrubber::COMPARE_MAPS: - ceph_assert(recovery_state.get_last_update_applied() >= - scrubber.subset_last_update); - ceph_assert(scrubber.waiting_on_whom.empty()); - - scrub_compare_maps(); - scrubber.start = scrubber.end; - scrubber.run_callbacks(); - - // requeue the writes from the chunk that just finished - requeue_ops(waiting_for_scrub); - - scrubber.state = PG::Scrubber::WAIT_DIGEST_UPDATES; - - // fall-thru - - case PG::Scrubber::WAIT_DIGEST_UPDATES: - if (scrubber.num_digest_updates_pending) { - dout(10) << __func__ << " waiting on " - << scrubber.num_digest_updates_pending - << " digest updates" << dendl; - done = true; - break; - } - - scrubber.preempt_left = cct->_conf.get_val( - "osd_scrub_max_preemptions"); - scrubber.preempt_divisor = 1; - - if (!(scrubber.end.is_max())) { - scrubber.state = PG::Scrubber::NEW_CHUNK; - requeue_scrub(); - done = true; - } else { - scrubber.state = PG::Scrubber::FINISH; - } - - break; - - case PG::Scrubber::FINISH: - scrub_finish(); - scrubber.state = PG::Scrubber::INACTIVE; - done = true; - - if (!snap_trimq.empty()) { - dout(10) << "scrub finished, requeuing snap_trimmer" << dendl; - snap_trimmer_scrub_complete(); - } - - break; - - case 
PG::Scrubber::BUILD_MAP_REPLICA: - // build my own scrub map - if (scrub_preempted) { - dout(10) << __func__ << " preempted" << dendl; - ret = 0; - } else { - ret = build_scrub_map_chunk( - scrubber.replica_scrubmap, - scrubber.replica_scrubmap_pos, - scrubber.start, scrubber.end, - scrubber.deep, - handle); - } - if (ret == -EINPROGRESS) { - requeue_scrub(); - done = true; - break; - } - // reply - { - MOSDRepScrubMap *reply = new MOSDRepScrubMap( - spg_t(info.pgid.pgid, get_primary().shard), - scrubber.replica_scrub_start, - pg_whoami); - reply->preempted = scrub_preempted; - ::encode(scrubber.replica_scrubmap, reply->get_data()); - osd->send_message_osd_cluster( - get_primary().osd, reply, - scrubber.replica_scrub_start); - } - scrub_preempted = false; - scrub_can_preempt = false; - scrubber.state = PG::Scrubber::INACTIVE; - scrubber.replica_scrubmap = ScrubMap(); - scrubber.replica_scrubmap_pos = ScrubMapBuilder(); - scrubber.start = hobject_t(); - scrubber.end = hobject_t(); - scrubber.max_end = hobject_t(); - done = true; - break; - - default: - ceph_abort(); - } - } - dout(20) << "scrub final state " << Scrubber::state_string(scrubber.state) - << " [" << scrubber.start << "," << scrubber.end << ")" - << " max_end " << scrubber.max_end << dendl; +void PG::scrub_send_scrub_resched(epoch_t epoch_queued, + [[maybe_unused]] ThreadPool::TPHandle& handle) +{ + dout(10) << __func__ << (is_primary() ? 
" (primary)" : " (replica)") << dendl; + scrub_queued = false; + m_scrubber->send_scrub_resched(); } -bool PG::write_blocked_by_scrub(const hobject_t& soid) +void PG::scrub_send_resources_granted(epoch_t epoch_queued, + [[maybe_unused]] ThreadPool::TPHandle& handle) { - if (soid < scrubber.start || soid >= scrubber.end) { - return false; - } - if (scrub_can_preempt) { - if (!scrub_preempted) { - dout(10) << __func__ << " " << soid << " preempted" << dendl; - scrub_preempted = true; - } else { - dout(10) << __func__ << " " << soid << " already preempted" << dendl; - } - return false; - } - return true; + dout(10) << __func__ << " queued at: " << epoch_queued << dendl; + m_scrubber->send_remotes_reserved(); } -bool PG::range_intersects_scrub(const hobject_t &start, const hobject_t& end) +void PG::scrub_send_resources_denied(epoch_t epoch_queued, + [[maybe_unused]] ThreadPool::TPHandle& handle) { - // does [start, end] intersect [scrubber.start, scrubber.max_end) - return (start < scrubber.max_end && - end >= scrubber.start); + dout(10) << __func__ << " queued at: " << epoch_queued << dendl; + m_scrubber->send_reservation_failure(); } -void PG::scrub_clear_state(bool has_error) +void PG::replica_scrub_resched(epoch_t epoch_queued, + [[maybe_unused]] ThreadPool::TPHandle& handle) { - ceph_assert(is_locked()); - state_clear(PG_STATE_SCRUBBING); - if (!has_error) - state_clear(PG_STATE_REPAIR); - state_clear(PG_STATE_DEEP_SCRUB); - publish_stats_to_osd(); - - scrubber.req_scrub = false; - // local -> nothing. 
- if (scrubber.local_reserved) { - osd->dec_scrubs_local(); - scrubber.local_reserved = false; - scrubber.reserved_peers.clear(); - } - - requeue_ops(waiting_for_scrub); - - scrubber.reset(); - - // type-specific state clear - _scrub_clear_state(); + dout(10) << __func__ << " queued at: " << epoch_queued << dendl; + scrub_queued = false; + m_scrubber->replica_scrub_resched(epoch_queued); } -void PG::scrub_compare_maps() +void PG::scrub_send_pushes_update(epoch_t epoch_queued, + [[maybe_unused]] ThreadPool::TPHandle& handle) { - dout(10) << __func__ << " has maps, analyzing" << dendl; - - // construct authoritative scrub map for type specific scrubbing - scrubber.cleaned_meta_map.insert(scrubber.primary_scrubmap); - map, - std::optional>> missing_digest; - - map maps; - maps[pg_whoami] = &scrubber.primary_scrubmap; - - for (const auto& i : get_acting_recovery_backfill()) { - if (i == pg_whoami) continue; - dout(2) << __func__ << " replica " << i << " has " - << scrubber.received_maps[i].objects.size() - << " items" << dendl; - maps[i] = &scrubber.received_maps[i]; - } - - set master_set; - - // Construct master set - for (const auto& map : maps) { - for (const auto& i : map.second->objects) { - master_set.insert(i.first); - } + dout(10) << __func__ << " queued at: " << epoch_queued << dendl; + if (pg_has_reset_since(epoch_queued)) { + dout(10) << __func__ << " been reset at " + << recovery_state.get_last_peering_reset() << dendl; + return; } + m_scrubber->active_pushes_notification(); +} - stringstream ss; - get_pgbackend()->be_omap_checks(maps, master_set, - scrubber.omap_stats, ss); +void PG::scrub_send_replica_pushes(epoch_t epoch_queued, + [[maybe_unused]] ThreadPool::TPHandle& handle) +{ + dout(10) << __func__ << " queued at: " << epoch_queued << dendl; + m_scrubber->send_replica_pushes_upd(); +} - if (!ss.str().empty()) { - osd->clog->warn(ss); +void PG::scrub_send_applied_update(epoch_t epoch_queued, + [[maybe_unused]] ThreadPool::TPHandle& handle) +{ + 
dout(10) << __func__ << " queued at: " << epoch_queued << dendl; + if (pg_has_reset_since(epoch_queued)) { + dout(10) << __func__ << " been reset at " + << recovery_state.get_last_peering_reset() << dendl; + return; } + m_scrubber->update_applied_notification(epoch_queued); +} - if (recovery_state.get_acting().size() > 1) { - dout(10) << __func__ << " comparing replica scrub maps" << dendl; - - // Map from object with errors to good peer - map> authoritative; - - dout(2) << __func__ << get_primary() << " has " - << scrubber.primary_scrubmap.objects.size() << " items" << dendl; - - ss.str(""); - ss.clear(); - - get_pgbackend()->be_compare_scrubmaps( - maps, - master_set, - state_test(PG_STATE_REPAIR), - scrubber.missing, - scrubber.inconsistent, - authoritative, - missing_digest, - scrubber.shallow_errors, - scrubber.deep_errors, - scrubber.store.get(), - info.pgid, recovery_state.get_acting(), - ss); - dout(2) << ss.str() << dendl; - - if (!ss.str().empty()) { - osd->clog->error(ss); - } - - for (map>::iterator i = authoritative.begin(); - i != authoritative.end(); - ++i) { - list > good_peers; - for (list::const_iterator j = i->second.begin(); - j != i->second.end(); - ++j) { - good_peers.emplace_back(maps[*j]->objects[i->first], *j); - } - scrubber.authoritative.emplace(i->first, good_peers); - } - - for (map>::iterator i = authoritative.begin(); - i != authoritative.end(); - ++i) { - scrubber.cleaned_meta_map.objects.erase(i->first); - scrubber.cleaned_meta_map.objects.insert( - *(maps[i->second.back()]->objects.find(i->first)) - ); - } +void PG::scrub_send_unblocking(epoch_t epoch_queued, + [[maybe_unused]] ThreadPool::TPHandle& handle) +{ + dout(10) << __func__ << " queued at: " << epoch_queued << dendl; + if (pg_has_reset_since(epoch_queued)) { + dout(10) << __func__ << " been reset at " + << recovery_state.get_last_peering_reset() << dendl; + return; } + m_scrubber->send_scrub_unblock(); +} - ScrubMap for_meta_scrub; - 
scrubber.clean_meta_map(for_meta_scrub); - - // ok, do the pg-type specific scrubbing - scrub_snapshot_metadata(for_meta_scrub, missing_digest); - // Called here on the primary can use an authoritative map if it isn't the primary - _scan_snaps(for_meta_scrub); - if (!scrubber.store->empty()) { - if (state_test(PG_STATE_REPAIR)) { - dout(10) << __func__ << ": discarding scrub results" << dendl; - scrubber.store->flush(nullptr); - } else { - dout(10) << __func__ << ": updating scrub object" << dendl; - ObjectStore::Transaction t; - scrubber.store->flush(&t); - osd->store->queue_transaction(ch, std::move(t), nullptr); - } - } +void PG::scrub_send_digest_update(epoch_t epoch_queued, + [[maybe_unused]] ThreadPool::TPHandle& handle) +{ + dout(10) << __func__ << " queued at: " << epoch_queued << dendl; + m_scrubber->digest_update_notification(); } -bool PG::scrub_process_inconsistent() +void PG::scrub_send_replmaps_ready(epoch_t epoch_queued, + [[maybe_unused]] ThreadPool::TPHandle& handle) { - dout(10) << __func__ << ": checking authoritative" << dendl; - bool repair = state_test(PG_STATE_REPAIR); - bool deep_scrub = state_test(PG_STATE_DEEP_SCRUB); - const char *mode = (repair ? "repair": (deep_scrub ? "deep-scrub" : "scrub")); - - // authoriative only store objects which missing or inconsistent. 
- if (!scrubber.authoritative.empty()) { - stringstream ss; - ss << info.pgid << " " << mode << " " - << scrubber.missing.size() << " missing, " - << scrubber.inconsistent.size() << " inconsistent objects"; - dout(2) << ss.str() << dendl; - osd->clog->error(ss); - if (repair) { - state_clear(PG_STATE_CLEAN); - for (map >>::iterator i = - scrubber.authoritative.begin(); - i != scrubber.authoritative.end(); - ++i) { - auto missing_entry = scrubber.missing.find(i->first); - if (missing_entry != scrubber.missing.end()) { - repair_object( - i->first, - i->second, - missing_entry->second); - scrubber.fixed += missing_entry->second.size(); - } - if (scrubber.inconsistent.count(i->first)) { - repair_object( - i->first, - i->second, - scrubber.inconsistent[i->first]); - scrubber.fixed += missing_entry->second.size(); - } - } - } - } - return (!scrubber.authoritative.empty() && repair); + dout(10) << __func__ << " queued at: " << epoch_queued << dendl; + m_scrubber->send_replica_maps_ready(); } -bool PG::ops_blocked_by_scrub() const { +bool PG::ops_blocked_by_scrub() const +{ return (waiting_for_scrub.size() != 0); } -// the part that actually finalizes a scrub -void PG::scrub_finish() +Scrub::scrub_prio_t PG::is_scrub_blocking_ops() const { - dout(20) << __func__ << dendl; - bool repair = state_test(PG_STATE_REPAIR); - bool do_auto_scrub = false; - // if the repair request comes from auto-repair and large number of errors, - // we would like to cancel auto-repair - if (repair && scrubber.auto_repair - && scrubber.authoritative.size() > cct->_conf->osd_scrub_auto_repair_num_errors) { - state_clear(PG_STATE_REPAIR); - repair = false; - } - bool deep_scrub = state_test(PG_STATE_DEEP_SCRUB); - const char *mode = (repair ? "repair": (deep_scrub ? "deep-scrub" : "scrub")); - - // if a regular scrub had errors within the limit, do a deep scrub to auto repair. 
- if (scrubber.deep_scrub_on_error - && scrubber.authoritative.size() - && scrubber.authoritative.size() <= cct->_conf->osd_scrub_auto_repair_num_errors) { - ceph_assert(!deep_scrub); - do_auto_scrub = true; - dout(20) << __func__ << " Try to auto repair after scrub errors" << dendl; - } - scrubber.deep_scrub_on_error = false; - - // type-specific finish (can tally more errors) - _scrub_finish(); - - bool has_error = scrub_process_inconsistent(); - - { - stringstream oss; - oss << info.pgid.pgid << " " << mode << " "; - int total_errors = scrubber.shallow_errors + scrubber.deep_errors; - if (total_errors) - oss << total_errors << " errors"; - else - oss << "ok"; - if (!deep_scrub && info.stats.stats.sum.num_deep_scrub_errors) - oss << " ( " << info.stats.stats.sum.num_deep_scrub_errors - << " remaining deep scrub error details lost)"; - if (repair) - oss << ", " << scrubber.fixed << " fixed"; - if (total_errors) - osd->clog->error(oss); - else - osd->clog->debug(oss); - } - - // Since we don't know which errors were fixed, we can only clear them - // when every one has been fixed. - if (repair) { - if (scrubber.fixed == scrubber.shallow_errors + scrubber.deep_errors) { - ceph_assert(deep_scrub); - scrubber.shallow_errors = scrubber.deep_errors = 0; - dout(20) << __func__ << " All may be fixed" << dendl; - } else if (has_error) { - // Deep scrub in order to get corrected error counts - scrub_after_recovery = true; - save_req_scrub = scrubber.req_scrub; - dout(20) << __func__ << " Set scrub_after_recovery, req_scrub=" << save_req_scrub << dendl; - } else if (scrubber.shallow_errors || scrubber.deep_errors) { - // We have errors but nothing can be fixed, so there is no repair - // possible. 
- state_set(PG_STATE_FAILED_REPAIR); - dout(10) << __func__ << " " << (scrubber.shallow_errors + scrubber.deep_errors) - << " error(s) present with no repair possible" << dendl; - } - } - - { - // finish up - ObjectStore::Transaction t; - recovery_state.update_stats( - [this, deep_scrub](auto &history, auto &stats) { - utime_t now = ceph_clock_now(); - history.last_scrub = recovery_state.get_info().last_update; - history.last_scrub_stamp = now; - if (scrubber.deep) { - history.last_deep_scrub = recovery_state.get_info().last_update; - history.last_deep_scrub_stamp = now; - } - - if (deep_scrub) { - if ((scrubber.shallow_errors == 0) && (scrubber.deep_errors == 0)) - history.last_clean_scrub_stamp = now; - stats.stats.sum.num_shallow_scrub_errors = scrubber.shallow_errors; - stats.stats.sum.num_deep_scrub_errors = scrubber.deep_errors; - stats.stats.sum.num_large_omap_objects = scrubber.omap_stats.large_omap_objects; - stats.stats.sum.num_omap_bytes = scrubber.omap_stats.omap_bytes; - stats.stats.sum.num_omap_keys = scrubber.omap_stats.omap_keys; - dout(25) << "scrub_finish shard " << pg_whoami << " num_omap_bytes = " - << stats.stats.sum.num_omap_bytes << " num_omap_keys = " - << stats.stats.sum.num_omap_keys << dendl; - } else { - stats.stats.sum.num_shallow_scrub_errors = scrubber.shallow_errors; - // XXX: last_clean_scrub_stamp doesn't mean the pg is not inconsistent - // because of deep-scrub errors - if (scrubber.shallow_errors == 0) - history.last_clean_scrub_stamp = now; - } - stats.stats.sum.num_scrub_errors = - stats.stats.sum.num_shallow_scrub_errors + - stats.stats.sum.num_deep_scrub_errors; - if (scrubber.check_repair) { - scrubber.check_repair = false; - if (info.stats.stats.sum.num_scrub_errors) { - state_set(PG_STATE_FAILED_REPAIR); - dout(10) << "scrub_finish " << info.stats.stats.sum.num_scrub_errors - << " error(s) still present after re-scrub" << dendl; - } - } - return true; - }, - &t); - int tr = osd->store->queue_transaction(ch, std::move(t), 
NULL); - ceph_assert(tr == 0); - } - - if (has_error) { - queue_peering_event( - PGPeeringEventRef( - std::make_shared( - get_osdmap_epoch(), - get_osdmap_epoch(), - PeeringState::DoRecovery()))); - } - - scrub_clear_state(has_error); - scrub_unreserve_replicas(); - - if (do_auto_scrub) { - scrub_requested(false, false, true); - } - - if (is_active() && is_primary()) { - recovery_state.share_pg_info(); - } + return waiting_for_scrub.size() ? Scrub::scrub_prio_t::high_priority + : Scrub::scrub_prio_t::low_priority; } bool PG::old_peering_msg(epoch_t reply_epoch, epoch_t query_epoch) { - if (get_last_peering_reset() > reply_epoch || - get_last_peering_reset() > query_epoch) { - dout(10) << "old_peering_msg reply_epoch " << reply_epoch << " query_epoch " << query_epoch - << " last_peering_reset " << get_last_peering_reset() - << dendl; + if (auto last_reset = get_last_peering_reset(); + last_reset > reply_epoch || last_reset > query_epoch) { + dout(10) << "old_peering_msg reply_epoch " << reply_epoch << " query_epoch " + << query_epoch << " last_peering_reset " << last_reset << dendl; return true; } return false; @@ -3453,24 +2271,12 @@ bool PG::try_flush_or_schedule_async() ostream& operator<<(ostream& out, const PG& pg) { out << pg.recovery_state; - if (pg.scrubber.must_repair) - out << " MUST_REPAIR"; - if (pg.scrubber.auto_repair) - out << " AUTO_REPAIR"; - if (pg.scrubber.check_repair) - out << " CHECK_REPAIR"; - if (pg.scrubber.deep_scrub_on_error) - out << " DEEP_SCRUB_ON_ERROR"; - if (pg.scrubber.must_deep_scrub) - out << " MUST_DEEP_SCRUB"; - if (pg.scrubber.must_scrub) - out << " MUST_SCRUB"; - if (pg.scrubber.time_for_deep) - out << " TIME_FOR_DEEP"; - if (pg.scrubber.need_auto) - out << " NEED_AUTO"; - if (pg.scrubber.req_scrub) - out << " REQ_SCRUB"; + + // listing all scrub-related flags - both current and "planned next scrub" + if (pg.is_scrubbing()) { + out << *pg.m_scrubber; + } + out << pg.m_planned_scrub; if (pg.recovery_ops_active) out << " rops=" 
<< pg.recovery_ops_active; @@ -3596,15 +2402,19 @@ bool PG::can_discard_replica_op(OpRequestRef& op) // resets the messenger sesssion when the replica reconnects. to avoid the // out-of-order replies, the messages from that replica should be discarded. OSDMapRef next_map = osd->get_next_osdmap(); - if (next_map->is_down(from)) + if (next_map->is_down(from)) { + dout(20) << " " << __func__ << " dead for nextmap is down " << from << dendl; return true; + } /* Mostly, this overlaps with the old_peering_msg * condition. An important exception is pushes * sent by replicas not in the acting set, since * if such a replica goes down it does not cause * a new interval. */ - if (next_map->get_down_at(from) >= m->map_epoch) + if (next_map->get_down_at(from) >= m->map_epoch) { + dout(20) << " " << __func__ << " dead for 'get_down_at' " << from << dendl; return true; + } // same pg? // if pg changes _at all_, we reset and repeer! @@ -3798,45 +2608,6 @@ void PG::handle_initialize(PeeringCtx &rctx) recovery_state.handle_event(evt, &rctx); } -void PG::Scrubber::dump(Formatter *f) -{ - f->open_object_section("scrubber"); - f->dump_stream("epoch_start") << epoch_start; - f->dump_bool("active", active); - if (active) { - f->dump_string("state", state_string(state)); - f->dump_stream("start") << start; - f->dump_stream("end") << end; - f->dump_stream("max_end") << max_end; - f->dump_stream("subset_last_update") << subset_last_update; - f->dump_bool("deep", deep); - f->dump_bool("must_scrub", must_scrub); - f->dump_bool("must_deep_scrub", must_deep_scrub); - f->dump_bool("must_repair", must_repair); - f->dump_bool("need_auto", need_auto); - f->dump_bool("req_scrub", req_scrub); - f->dump_bool("time_for_deep", time_for_deep); - f->dump_bool("auto_repair", auto_repair); - f->dump_bool("check_repair", check_repair); - f->dump_bool("deep_scrub_on_error", deep_scrub_on_error); - f->dump_stream("scrub_reg_stamp") << scrub_reg_stamp; //utime_t - f->dump_stream("waiting_on_whom") << 
waiting_on_whom; //set - f->dump_unsigned("priority", priority); - f->dump_int("shallow_errors", shallow_errors); - f->dump_int("deep_errors", deep_errors); - f->dump_int("fixed", fixed); - { - f->open_array_section("waiting_on_whom"); - for (set::iterator p = waiting_on_whom.begin(); - p != waiting_on_whom.end(); - ++p) { - f->dump_stream("shard") << *p; - } - f->close_section(); - } - } - f->close_section(); -} void PG::handle_query_state(Formatter *f) { @@ -3846,27 +2617,8 @@ void PG::handle_query_state(Formatter *f) // This code has moved to after the close of recovery_state array. // I don't think that scrub is a recovery state - if (is_primary() && is_active()) { - f->open_object_section("scrub"); - f->dump_stream("scrubber.epoch_start") << scrubber.epoch_start; - f->dump_bool("scrubber.active", scrubber.active); - f->dump_string("scrubber.state", PG::Scrubber::state_string(scrubber.state)); - f->dump_stream("scrubber.start") << scrubber.start; - f->dump_stream("scrubber.end") << scrubber.end; - f->dump_stream("scrubber.max_end") << scrubber.max_end; - f->dump_stream("scrubber.subset_last_update") << scrubber.subset_last_update; - f->dump_bool("scrubber.deep", scrubber.deep); - { - f->open_array_section("scrubber.waiting_on_whom"); - for (set::iterator p = scrubber.waiting_on_whom.begin(); - p != scrubber.waiting_on_whom.end(); - ++p) { - f->dump_stream("shard") << *p; - } - f->close_section(); - } - f->dump_string("comment", "DEPRECATED - may be removed in the next release"); - f->close_section(); + if (is_primary() && is_active() && m_scrubber->is_scrub_active()) { + m_scrubber->handle_query_state(f); } } diff --git a/src/osd/PG.h b/src/osd/PG.h index 69f631394ef..9119b297938 100644 --- a/src/osd/PG.h +++ b/src/osd/PG.h @@ -177,8 +177,13 @@ class PG : public DoutPrefixProvider, public PeeringState::PeeringListener { public: const pg_shard_t pg_whoami; const spg_t pg_id; + + std::unique_ptr m_scrubber; + /// flags detailing scheduling/operation 
characteristics of the next scrub requested_scrub_t m_planned_scrub; + /// scrubbing state for both Primary & replicas + bool is_scrub_active() const { return m_scrubber->is_scrub_active(); } public: // -- members -- @@ -375,14 +380,27 @@ public: ObjectStore::Transaction &t); void scrub(epoch_t queued, ThreadPool::TPHandle &handle); + /** + * a special version of PG::scrub(), which: + * - is initiated after repair, and + * - is not required to allocate local/remote OSD scrub resources + */ + void recovery_scrub(epoch_t queued, ThreadPool::TPHandle &handle); + void replica_scrub(epoch_t queued, ThreadPool::TPHandle &handle); + void replica_scrub_resched(epoch_t queued, ThreadPool::TPHandle &handle); /// Queues a PGScrubResourcesOK message. Will translate into 'RemotesReserved' FSM event void scrub_send_resources_granted(epoch_t queued, ThreadPool::TPHandle &handle); void scrub_send_resources_denied(epoch_t queued, ThreadPool::TPHandle &handle); + void scrub_send_scrub_resched(epoch_t queued, ThreadPool::TPHandle &handle); + void scrub_send_pushes_update(epoch_t queued, ThreadPool::TPHandle &handle); + void scrub_send_applied_update(epoch_t queued, ThreadPool::TPHandle &handle); + void scrub_send_unblocking(epoch_t epoch_queued, ThreadPool::TPHandle &handle); + void scrub_send_digest_update(epoch_t epoch_queued, ThreadPool::TPHandle &handle); + void scrub_send_replmaps_ready(epoch_t epoch_queued, ThreadPool::TPHandle &handle); + void scrub_send_replica_pushes(epoch_t queued, ThreadPool::TPHandle &handle); - bool is_scrub_registered(); void reg_next_scrub(); - void unreg_next_scrub(); void queue_want_pg_temp(const std::vector &wanted) override; void clear_want_pg_temp() override; @@ -398,7 +416,7 @@ public: void on_info_history_change() override; - void scrub_requested(bool deep, bool repair, bool need_auto = false) override; + void scrub_requested(scrub_level_t scrub_level, scrub_type_t scrub_type) override; uint64_t get_snap_trimq_size() const override { return 
snap_trimq.size(); @@ -444,13 +462,7 @@ public: return finish_recovery(); } - void on_activate(interval_set snaps) override { - ceph_assert(scrubber.callbacks.empty()); - ceph_assert(callbacks_for_degraded_object.empty()); - snap_trimq = snaps; - release_pg_backoffs(); - projected_last_update = info.last_update; - } + void on_activate(interval_set snaps) override; void on_activate_committed() override; @@ -526,14 +538,37 @@ public: void shutdown(); virtual void on_shutdown() = 0; - bool get_must_scrub() const { - return scrubber.must_scrub; - } + bool get_must_scrub() const; bool sched_scrub(); unsigned int scrub_requeue_priority(Scrub::scrub_prio_t with_priority, unsigned int suggested_priority) const; /// the version that refers to flags_.priority unsigned int scrub_requeue_priority(Scrub::scrub_prio_t with_priority) const; +private: + // auxiliaries used by sched_scrub(): + double next_deepscrub_interval() const; + + /// should we perform deep scrub? + bool is_time_for_deep(bool allow_deep_scrub, + bool allow_scrub, + bool has_deep_errors, + const requested_scrub_t& planned) const; + + /** + * Verify the various 'next scrub' flags in m_planned_scrub against configuration + * and scrub-related timestamps. 
+ * + * @returns an updated copy of the m_planned_scrub (or nothing if no scrubbing) + */ + std::optional verify_scrub_mode() const; + + bool verify_periodic_scrub_mode(bool allow_deep_scrub, + bool try_to_auto_repair, + bool allow_regular_scrub, + bool has_deep_errors, + requested_scrub_t& planned) const; + +public: virtual void do_request( OpRequestRef& op, ThreadPool::TPHandle &handle @@ -946,7 +981,7 @@ protected: pg->get_pgbackend()->trim(entry, t); } }; - + void update_object_snap_mapping( ObjectStore::Transaction *t, const hobject_t &soid, const std::set &snaps); @@ -1013,248 +1048,23 @@ public: hobject_t end = info.pgid.pgid.get_hobj_end(pool.info.get_pg_num()); release_backoffs(begin, end); } -protected: // -- scrub -- -public: - struct Scrubber { - Scrubber(); - ~Scrubber(); - - // metadata - std::set reserved_peers; - bool local_reserved, remote_reserved, reserve_failed; - epoch_t epoch_start; - - // common to both scrubs - bool active; - std::set waiting_on_whom; - int shallow_errors; - int deep_errors; - int fixed; - ScrubMap primary_scrubmap; - ScrubMapBuilder primary_scrubmap_pos; - epoch_t replica_scrub_start = 0; - ScrubMap replica_scrubmap; - ScrubMapBuilder replica_scrubmap_pos; - std::map received_maps; - OpRequestRef active_rep_scrub; - utime_t scrub_reg_stamp; // stamp we registered for - - static utime_t scrub_must_stamp() { return utime_t(0,1); } - - omap_stat_t omap_stats = (const struct omap_stat_t){ 0 }; - - // For async sleep - bool sleeping = false; - bool needs_sleep = true; - utime_t sleep_start; - - // flags to indicate explicitly requested scrubs (by admin) - bool must_scrub, must_deep_scrub, must_repair, need_auto, req_scrub; - - // Priority to use for scrub scheduling - unsigned priority = 0; - - bool time_for_deep; - // this flag indicates whether we would like to do auto-repair of the PG or not - bool auto_repair; - // this flag indicates that we are scrubbing post repair to verify everything is fixed - bool check_repair; - // 
this flag indicates that if a regular scrub detects errors <= osd_scrub_auto_repair_num_errors, - // we should deep scrub in order to auto repair - bool deep_scrub_on_error; - - // Maps from objects with errors to missing/inconsistent peers - std::map> missing; - std::map> inconsistent; - - // Std::map from object with errors to good peers - std::map >> authoritative; - - // Cleaned std::map pending snap metadata scrub - ScrubMap cleaned_meta_map; - - void clean_meta_map(ScrubMap &for_meta_scrub) { - if (end.is_max() || - cleaned_meta_map.objects.empty()) { - cleaned_meta_map.swap(for_meta_scrub); - } else { - auto iter = cleaned_meta_map.objects.end(); - --iter; // not empty, see if clause - auto begin = cleaned_meta_map.objects.begin(); - if (iter->first.has_snapset()) { - ++iter; - } else { - while (iter != begin) { - auto next = iter--; - if (next->first.get_head() != iter->first.get_head()) { - ++iter; - break; - } - } - } - for_meta_scrub.objects.insert(begin, iter); - cleaned_meta_map.objects.erase(begin, iter); - } - } - - // digest updates which we are waiting on - int num_digest_updates_pending; - - // chunky scrub - hobject_t start, end; // [start,end) - hobject_t max_end; // Largest end that may have been sent to replicas - eversion_t subset_last_update; - - // chunky scrub state - enum State { - INACTIVE, - NEW_CHUNK, - WAIT_PUSHES, - WAIT_LAST_UPDATE, - BUILD_MAP, - BUILD_MAP_DONE, - WAIT_REPLICAS, - COMPARE_MAPS, - WAIT_DIGEST_UPDATES, - FINISH, - BUILD_MAP_REPLICA, - } state; - - std::unique_ptr store; - // deep scrub - bool deep; - int preempt_left; - int preempt_divisor; - - std::list callbacks; - void add_callback(Context *context) { - callbacks.push_back(context); - } - void run_callbacks() { - std::list to_run; - to_run.swap(callbacks); - for (std::list::iterator i = to_run.begin(); - i != to_run.end(); - ++i) { - (*i)->complete(0); - } - } - - static const char *state_string(const PG::Scrubber::State& state) { - const char *ret = NULL; - 
switch( state ) - { - case INACTIVE: ret = "INACTIVE"; break; - case NEW_CHUNK: ret = "NEW_CHUNK"; break; - case WAIT_PUSHES: ret = "WAIT_PUSHES"; break; - case WAIT_LAST_UPDATE: ret = "WAIT_LAST_UPDATE"; break; - case BUILD_MAP: ret = "BUILD_MAP"; break; - case BUILD_MAP_DONE: ret = "BUILD_MAP_DONE"; break; - case WAIT_REPLICAS: ret = "WAIT_REPLICAS"; break; - case COMPARE_MAPS: ret = "COMPARE_MAPS"; break; - case WAIT_DIGEST_UPDATES: ret = "WAIT_DIGEST_UPDATES"; break; - case FINISH: ret = "FINISH"; break; - case BUILD_MAP_REPLICA: ret = "BUILD_MAP_REPLICA"; break; - } - return ret; - } - - bool is_chunky_scrub_active() const { return state != INACTIVE; } - - // clear all state - void reset() { - active = false; - waiting_on_whom.clear(); - if (active_rep_scrub) { - active_rep_scrub = OpRequestRef(); - } - received_maps.clear(); - - must_scrub = false; - must_deep_scrub = false; - must_repair = false; - need_auto = false; - req_scrub = false; - time_for_deep = false; - auto_repair = false; - check_repair = false; - deep_scrub_on_error = false; - - state = PG::Scrubber::INACTIVE; - start = hobject_t(); - end = hobject_t(); - max_end = hobject_t(); - subset_last_update = eversion_t(); - shallow_errors = 0; - deep_errors = 0; - fixed = 0; - omap_stats = (const struct omap_stat_t){ 0 }; - deep = false; - run_callbacks(); - inconsistent.clear(); - missing.clear(); - authoritative.clear(); - num_digest_updates_pending = 0; - primary_scrubmap = ScrubMap(); - primary_scrubmap_pos.reset(); - replica_scrubmap = ScrubMap(); - replica_scrubmap_pos.reset(); - cleaned_meta_map = ScrubMap(); - sleeping = false; - needs_sleep = true; - sleep_start = utime_t(); - } - - void create_results(const hobject_t& obj); - void cleanup_store(ObjectStore::Transaction *t); - void dump(ceph::Formatter *f); - } scrubber; - protected: bool scrub_after_recovery; - bool save_req_scrub; // Saved for scrub_after_recovery int active_pushes; - bool scrub_can_preempt = false; - bool scrub_preempted = 
false; - - // we allow some number of preemptions of the scrub, which mean we do - // not block. then we start to block. once we start blocking, we do - // not stop until the scrub range is completed. - bool write_blocked_by_scrub(const hobject_t &soid); - - /// true if the given range intersects the scrub interval in any way - bool range_intersects_scrub(const hobject_t &start, const hobject_t& end); - void repair_object( const hobject_t &soid, const std::list > &ok_peers, const std::set &bad_peers); - void abort_scrub(); - void chunky_scrub(ThreadPool::TPHandle &handle); - void scrub_compare_maps(); - /** - * return true if any inconsistency/missing is repaired, false otherwise - */ - bool scrub_process_inconsistent(); - bool ops_blocked_by_scrub() const; - void scrub_finish(); - void scrub_clear_state(bool keep_repair = false); - void _scan_snaps(ScrubMap &map); + [[nodiscard]] bool ops_blocked_by_scrub() const; + [[nodiscard]] Scrub::scrub_prio_t is_scrub_blocking_ops() const; + void _repair_oinfo_oid(ScrubMap &map); void _scan_rollback_obs(const std::vector &rollback_obs); - void _request_scrub_map(pg_shard_t replica, eversion_t version, - hobject_t start, hobject_t end, bool deep, - bool allow_preemption); - int build_scrub_map_chunk( - ScrubMap &map, - ScrubMapBuilder &pos, - hobject_t start, hobject_t end, bool deep, - ThreadPool::TPHandle &handle); /** * returns true if [begin, end) is good to scrub at this time * a false return value obliges the implementer to requeue scrub when the @@ -1262,27 +1072,12 @@ protected: */ virtual bool _range_available_for_scrub( const hobject_t &begin, const hobject_t &end) = 0; - virtual void scrub_snapshot_metadata( - ScrubMap &map, - const std::map, - std::optional>> &missing_digest) { } - virtual void _scrub_clear_state() { } - virtual void _scrub_finish() { } - void clear_scrub_reserved(); - void scrub_reserve_replicas(); - void scrub_unreserve_replicas(); - bool scrub_all_replicas_reserved() const; - - void 
replica_scrub( - OpRequestRef op, - ThreadPool::TPHandle &handle); - void do_replica_scrub_map(OpRequestRef op); - - void handle_scrub_reserve_request(OpRequestRef op); - void handle_scrub_reserve_grant(OpRequestRef op, pg_shard_t from); - void handle_scrub_reserve_reject(OpRequestRef op, pg_shard_t from); - void handle_scrub_reserve_release(OpRequestRef op); + + /** + * Initiate the process that will create our scrub map for the Primary. + * (triggered by MSG_OSD_REP_SCRUB) + */ + void replica_scrub(OpRequestRef op, ThreadPool::TPHandle &handle); // -- recovery state -- @@ -1332,7 +1127,7 @@ protected: bool is_clean() const { return recovery_state.is_clean(); } bool is_degraded() const { return recovery_state.is_degraded(); } bool is_undersized() const { return recovery_state.is_undersized(); } - bool is_scrubbing() const { return state_test(PG_STATE_SCRUBBING); } + bool is_scrubbing() const { return state_test(PG_STATE_SCRUBBING); } // Primary only bool is_remapped() const { return recovery_state.is_remapped(); } bool is_peered() const { return recovery_state.is_peered(); } bool is_recovering() const { return recovery_state.is_recovering(); } @@ -1395,10 +1190,10 @@ protected: virtual void kick_snap_trim() = 0; virtual void snap_trimmer_scrub_complete() = 0; - bool requeue_scrub(bool high_priority = false); + void queue_recovery(); - bool queue_scrub(); - unsigned get_scrub_priority(); + void queue_scrub_after_repair(); + unsigned int get_scrub_priority(); bool try_flush_or_schedule_async() override; void start_flush_on_transaction( diff --git a/src/osd/PeeringState.h b/src/osd/PeeringState.h index d036c1d00a6..98b8fc561c9 100644 --- a/src/osd/PeeringState.h +++ b/src/osd/PeeringState.h @@ -263,7 +263,7 @@ public: /// Notify that info/history changed (generally to update scrub registration) virtual void on_info_history_change() = 0; /// Notify that a scrub has been requested - virtual void scrub_requested(bool deep, bool repair, bool need_auto = false) = 0; + 
virtual void scrub_requested(scrub_level_t scrub_level, scrub_type_t scrub_type) = 0; /// Return current snap_trimq size virtual uint64_t get_snap_trimq_size() const = 0; @@ -502,12 +502,12 @@ public: }; struct RequestScrub : boost::statechart::event { - bool deep; - bool repair; - explicit RequestScrub(bool d, bool r) : deep(d), repair(r) {} + scrub_level_t deep; + scrub_type_t repair; + explicit RequestScrub(bool d, bool r) : deep(scrub_level_t(d)), repair(scrub_type_t(r)) {} void print(std::ostream *out) const { - *out << "RequestScrub(" << (deep ? "deep" : "shallow") - << (repair ? " repair" : ""); + *out << "RequestScrub(" << ((deep==scrub_level_t::deep) ? "deep" : "shallow") + << ((repair==scrub_type_t::do_repair) ? " repair)" : ")"); } }; diff --git a/src/osd/PrimaryLogPG.cc b/src/osd/PrimaryLogPG.cc index e06001401b7..edba0886707 100644 --- a/src/osd/PrimaryLogPG.cc +++ b/src/osd/PrimaryLogPG.cc @@ -18,6 +18,7 @@ #include "boost/tuple/tuple.hpp" #include "boost/intrusive_ptr.hpp" #include "PG.h" +#include "pg_scrubber.h" #include "PrimaryLogPG.h" #include "OSD.h" #include "OpRequest.h" @@ -937,7 +938,7 @@ PrimaryLogPG::get_pgls_filter(bufferlist::const_iterator& iter) if (r != 0) { derr << "Error opening class '" << class_name << "': " << cpp_strerror(r) << dendl; - if (r != -EPERM) // propogate permission error + if (r != -EPERM) // propagate permission error r = -EINVAL; return { r, nullptr }; } else { @@ -1010,7 +1011,7 @@ void PrimaryLogPG::do_command( f->close_section(); if (is_primary() && is_active()) { - scrubber.dump(f.get()); + m_scrubber->dump(f.get()); } f->open_object_section("agent_state"); @@ -1591,24 +1592,24 @@ int PrimaryLogPG::do_scrub_ls(const MOSDOp *m, OSDOp *osd_op) dout(10) << " corrupted scrub_ls_arg_t" << dendl; return -EINVAL; } + int r = 0; scrub_ls_result_t result = {.interval = info.history.same_interval_since}; + if (arg.interval != 0 && arg.interval != info.history.same_interval_since) { r = -EAGAIN; - } else if 
(!scrubber.store) { - r = -ENOENT; - } else if (arg.get_snapsets) { - result.vals = scrubber.store->get_snap_errors(get_pgid().pool(), - arg.start_after, - arg.max_return); } else { - result.vals = scrubber.store->get_object_errors(get_pgid().pool(), - arg.start_after, - arg.max_return); + bool store_queried = m_scrubber->get_store_errors(arg, result); + if (!store_queried) { + // the scrubber's store is not initialized + r = -ENOENT; + } } - encode(result, osd_op->outdata); + encode(result, osd_op->outdata); // RRR really? even if no store? + return r; } +} PrimaryLogPG::PrimaryLogPG(OSDService *o, OSDMapRef curmap, const PGPool &_pool, @@ -1621,11 +1622,14 @@ PrimaryLogPG::PrimaryLogPG(OSDService *o, OSDMapRef curmap, new_backfill(false), temp_seq(0), snap_trimmer_machine(this) -{ +{ recovery_state.set_backend_predicates( pgbackend->get_is_readable_predicate(), pgbackend->get_is_recoverable_predicate()); snap_trimmer_machine.initiate(); + + m_scrubber = make_unique(this); // *not* the final code + // next commit: m_scrubber = make_unique(this); } void PrimaryLogPG::get_src_oloc(const object_t& oid, const object_locator_t& oloc, object_locator_t& src_oloc) @@ -1790,16 +1794,16 @@ void PrimaryLogPG::do_request( auto m = op->get_req(); switch (m->type) { case MOSDScrubReserve::REQUEST: - handle_scrub_reserve_request(op); + m_scrubber->handle_scrub_reserve_request(op); break; case MOSDScrubReserve::GRANT: - handle_scrub_reserve_grant(op, m->from); + m_scrubber->handle_scrub_reserve_grant(op, m->from); break; case MOSDScrubReserve::REJECT: - handle_scrub_reserve_reject(op, m->from); + m_scrubber->handle_scrub_reserve_reject(op, m->from); break; case MOSDScrubReserve::RELEASE: - handle_scrub_reserve_release(op); + m_scrubber->handle_scrub_reserve_release(op); break; } } @@ -2051,7 +2055,7 @@ void PrimaryLogPG::do_op(OpRequestRef& op) return; } - if (scrubber.is_chunky_scrub_active() && write_blocked_by_scrub(head)) { + if (m_scrubber->is_scrub_active() && 
m_scrubber->write_blocked_by_scrub(head)) { dout(20) << __func__ << ": waiting for scrub" << dendl; waiting_for_scrub.push_back(op); op->mark_delayed("waiting for scrub"); @@ -2416,7 +2420,7 @@ PrimaryLogPG::cache_result_t PrimaryLogPG::maybe_handle_manifest_detail( return cache_result_t::BLOCKED_RECOVERY; } - if (write_blocked_by_scrub(head)) { + if (m_scrubber->write_blocked_by_scrub(head)) { dout(20) << __func__ << ": waiting for scrub" << dendl; waiting_for_scrub.push_back(op); op->mark_delayed("waiting for scrub"); @@ -3750,7 +3754,7 @@ void PrimaryLogPG::promote_object(ObjectContextRef obc, { hobject_t hoid = obc ? obc->obs.oi.soid : missing_oid; ceph_assert(hoid != hobject_t()); - if (write_blocked_by_scrub(hoid)) { + if (m_scrubber->write_blocked_by_scrub(hoid)) { dout(10) << __func__ << " " << hoid << " blocked by scrub" << dendl; if (op) { @@ -8668,16 +8672,7 @@ void PrimaryLogPG::apply_stats( } } - if (is_primary() && scrubber.active) { - if (soid < scrubber.start) { - dout(20) << __func__ << " " << soid << " < [" << scrubber.start - << "," << scrubber.end << ")" << dendl; - scrub_cstat.add(delta_stats); - } else { - dout(20) << __func__ << " " << soid << " >= [" << scrubber.start - << "," << scrubber.end << ")" << dendl; - } - } + m_scrubber->stats_of_handled_objects(delta_stats, soid); } void PrimaryLogPG::complete_read_ctx(int result, OpContext *ctx) @@ -10579,7 +10574,7 @@ int PrimaryLogPG::try_flush_mark_clean(FlushOpRef fop) } if (!fop->blocking && - write_blocked_by_scrub(oid)) { + m_scrubber->write_blocked_by_scrub(oid)) { if (fop->op) { dout(10) << __func__ << " blocked by scrub" << dendl; requeue_op(fop->op); @@ -10796,15 +10791,9 @@ void PrimaryLogPG::op_applied(const eversion_t &applied_version) ceph_assert(applied_version != eversion_t()); ceph_assert(applied_version <= info.last_update); recovery_state.local_write_applied(applied_version); - if (is_primary()) { - if (scrubber.active) { - if (recovery_state.get_last_update_applied() >= - 
scrubber.subset_last_update) { - requeue_scrub(ops_blocked_by_scrub()); - } - } else { - ceph_assert(scrubber.start == scrubber.end); - } + + if (is_primary() && m_scrubber->should_requeue_blocked_ops(recovery_state.get_last_update_applied())) { + osd->queue_scrub_applied_update(this, is_scrub_blocking_ops()); } } @@ -11231,11 +11220,11 @@ void PrimaryLogPG::handle_watch_timeout(WatchRef watch) return; } - if (write_blocked_by_scrub(obc->obs.oi.soid)) { + if (m_scrubber->write_blocked_by_scrub(obc->obs.oi.soid)) { dout(10) << "handle_watch_timeout waiting for scrub on obj " << obc->obs.oi.soid << dendl; - scrubber.add_callback( + m_scrubber->add_callback( watch->get_delayed_cb() // This callback! ); return; @@ -11693,11 +11682,15 @@ void PrimaryLogPG::kick_object_context_blocked(ObjectContextRef obc) } if (obc->requeue_scrub_on_unblock) { + obc->requeue_scrub_on_unblock = false; + + dout(20) << __func__ << " requeuing if still active: " << (is_active() ? "yes" : "no") << dendl; + // only requeue if we are still active: we may be unblocking // because we are resetting for a new peering interval if (is_active()) { - requeue_scrub(); + osd->queue_scrub_unblocking(this, is_scrub_blocking_ops()); } } } @@ -11932,9 +11925,10 @@ void PrimaryLogPG::_applied_recovered_object(ObjectContextRef obc) --active_pushes; // requeue an active chunky scrub waiting on recovery ops - if (!recovery_state.is_deleting() && active_pushes == 0 - && scrubber.is_chunky_scrub_active()) { - requeue_scrub(ops_blocked_by_scrub()); + if (!recovery_state.is_deleting() && active_pushes == 0 && + m_scrubber->is_scrub_active()) { + + osd->queue_scrub_pushes_update(this, is_scrub_blocking_ops()); } } @@ -11944,20 +11938,11 @@ void PrimaryLogPG::_applied_recovered_object_replica() ceph_assert(active_pushes >= 1); --active_pushes; - // requeue an active chunky scrub waiting on recovery ops + // requeue an active scrub waiting on recovery ops if (!recovery_state.is_deleting() && active_pushes == 0 && - 
scrubber.active_rep_scrub && static_cast( - scrubber.active_rep_scrub->get_req())->chunky) { - auto& op = scrubber.active_rep_scrub; - osd->enqueue_back( - OpSchedulerItem( - unique_ptr(new PGOpItem(info.pgid, op)), - op->get_req()->get_cost(), - op->get_req()->get_priority(), - op->get_req()->get_recv_stamp(), - op->get_req()->get_source().num(), - get_osdmap_epoch())); - scrubber.active_rep_scrub.reset(); + m_scrubber->is_scrub_active()) { + + osd->queue_scrub_replica_pushes(this, m_scrubber->replica_op_priority()); } } @@ -12366,10 +12351,9 @@ void PrimaryLogPG::on_shutdown() osd->clear_queued_recovery(this); } - clear_scrub_reserved(); - scrub_clear_state(); + m_scrubber->scrub_clear_state(); - unreg_next_scrub(); + m_scrubber->unreg_next_scrub(); vector tids; cancel_copy_ops(false, &tids); @@ -12488,7 +12472,7 @@ void PrimaryLogPG::on_change(ObjectStore::Transaction &t) requeue_ops(waiting_for_active); requeue_ops(waiting_for_readable); - clear_scrub_reserved(); + m_scrubber->clear_scrub_reservations(); vector tids; cancel_copy_ops(is_primary(), &tids); @@ -12518,7 +12502,7 @@ void PrimaryLogPG::on_change(ObjectStore::Transaction &t) } // requeues waiting_for_scrub - scrub_clear_state(); + m_scrubber->scrub_clear_state(); for (auto p = waiting_for_blocked_object.begin(); p != waiting_for_blocked_object.end(); @@ -12561,7 +12545,7 @@ void PrimaryLogPG::on_change(ObjectStore::Transaction &t) context_registry_on_change(); pgbackend->on_change_cleanup(&t); - scrubber.cleanup_store(&t); + m_scrubber->cleanup_store(&t); pgbackend->on_change(); // clear snap_trimmer state @@ -12613,6 +12597,8 @@ void PrimaryLogPG::_clear_recovery_state() #ifdef DEBUG_RECOVERY_OIDS recovering_oids.clear(); #endif + dout(15) << __func__ << " flags: " << m_planned_scrub << dendl; + last_backfill_started = hobject_t(); set::iterator i = backfills_in_flight.begin(); while (i != backfills_in_flight.end()) { @@ -13884,7 +13870,7 @@ void PrimaryLogPG::hit_set_remove_all() // Once we hit a 
degraded object just skip if (is_degraded_or_backfilling_object(aoid)) return; - if (write_blocked_by_scrub(aoid)) + if (m_scrubber->write_blocked_by_scrub(aoid)) return; } @@ -14004,7 +13990,7 @@ void PrimaryLogPG::hit_set_persist() // Once we hit a degraded object just skip further trim if (is_degraded_or_backfilling_object(aoid)) return; - if (write_blocked_by_scrub(aoid)) + if (m_scrubber->write_blocked_by_scrub(aoid)) return; } @@ -14037,7 +14023,7 @@ void PrimaryLogPG::hit_set_persist() new_hset.using_gmt); // If the current object is degraded we skip this persist request - if (write_blocked_by_scrub(oid)) + if (m_scrubber->write_blocked_by_scrub(oid)) return; hit_set->seal(); @@ -14284,7 +14270,7 @@ bool PrimaryLogPG::agent_work(int start_max, int agent_flush_quota) osd->logger->inc(l_osd_agent_skip); continue; } - if (range_intersects_scrub(obc->obs.oi.soid, + if (m_scrubber->range_intersects_scrub(obc->obs.oi.soid, obc->obs.oi.soid.get_head())) { dout(20) << __func__ << " skip (scrubbing) " << obc->obs.oi << dendl; osd->logger->inc(l_osd_agent_skip); @@ -14487,7 +14473,7 @@ bool PrimaryLogPG::agent_maybe_evict(ObjectContextRef& obc, bool after_flush) return false; } // This is already checked by agent_work() which passes after_flush = false - if (after_flush && range_intersects_scrub(soid, soid.get_head())) { + if (after_flush && m_scrubber->range_intersects_scrub(soid, soid.get_head())) { dout(20) << __func__ << " skip (scrubbing) " << obc->obs.oi << dendl; return false; } @@ -14911,9 +14897,20 @@ bool PrimaryLogPG::already_complete(eversion_t v) // ========================================================================================== // SCRUB +void PrimaryLogPG::do_replica_scrub_map(OpRequestRef op) +{ + dout(15) << __func__ << " is scrub active? 
" << m_scrubber->is_scrub_active() << dendl; + op->mark_started(); + + if (!m_scrubber->is_scrub_active()) { + dout(10) << __func__ << " scrub isn't active" << dendl; + return; + } + m_scrubber->map_from_replica(op); +} -bool PrimaryLogPG::_range_available_for_scrub( - const hobject_t &begin, const hobject_t &end) +bool PrimaryLogPG::_range_available_for_scrub(const hobject_t& begin, + const hobject_t& end) { pair next; next.second = object_contexts.lookup(begin); @@ -15526,7 +15523,7 @@ boost::statechart::result PrimaryLogPG::NotTrimming::react(const KickTrim&) ldout(pg->cct, 10) << "NotTrimming not clean or nothing to trim" << dendl; return discard_event(); } - if (pg->scrubber.active) { + if (pg->m_scrubber->is_scrub_active()) { ldout(pg->cct, 10) << " scrubbing, will requeue snap_trimmer after" << dendl; return transit< WaitScrub >(); } else { @@ -15741,6 +15738,10 @@ bool PrimaryLogPG::check_failsafe_full() { return osd->check_failsafe_full(get_dpp()); } +bool PrimaryLogPG::maybe_preempt_replica_scrub(const hobject_t& oid) +{ + return m_scrubber->write_blocked_by_scrub(oid); +} void intrusive_ptr_add_ref(PrimaryLogPG *pg) { pg->get("intptr"); } void intrusive_ptr_release(PrimaryLogPG *pg) { pg->put("intptr"); } diff --git a/src/osd/PrimaryLogPG.h b/src/osd/PrimaryLogPG.h index bc682332dba..a85c4f85ed4 100644 --- a/src/osd/PrimaryLogPG.h +++ b/src/osd/PrimaryLogPG.h @@ -574,6 +574,11 @@ public: OstreamTemp clog_error() override { return osd->clog->error(); } OstreamTemp clog_warn() override { return osd->clog->warn(); } + /** + * a scrub-map arrived from a replica + */ + void do_replica_scrub_map(OpRequestRef op); + struct watch_disconnect_t { uint64_t cookie; entity_name_t name; @@ -912,49 +917,10 @@ protected: * Releases locks * * @param manager [in] manager with locks to release + * + * (moved to .cc due to scrubber access) */ - void release_object_locks( - ObcLockManager &lock_manager) { - std::list > > to_req; - bool requeue_recovery = false; - bool 
requeue_snaptrim = false; - lock_manager.put_locks( - &to_req, - &requeue_recovery, - &requeue_snaptrim); - if (requeue_recovery) - queue_recovery(); - if (requeue_snaptrim) - snap_trimmer_machine.process_event(TrimWriteUnblocked()); - - if (!to_req.empty()) { - // requeue at front of scrub blocking queue if we are blocked by scrub - for (auto &&p: to_req) { - if (write_blocked_by_scrub(p.first->obs.oi.soid.get_head())) { - for (auto& op : p.second) { - op->mark_delayed("waiting for scrub"); - } - - waiting_for_scrub.splice( - waiting_for_scrub.begin(), - p.second, - p.second.begin(), - p.second.end()); - } else if (is_laggy()) { - for (auto& op : p.second) { - op->mark_delayed("waiting for readable"); - } - waiting_for_readable.splice( - waiting_for_readable.begin(), - p.second, - p.second.begin(), - p.second.end()); - } else { - requeue_ops(p.second); - } - } - } - } + void release_object_locks(ObcLockManager &lock_manager); // replica ops // [primary|tail] @@ -1964,9 +1930,7 @@ public: void on_removal(ObjectStore::Transaction &t) override; void on_shutdown() override; bool check_failsafe_full() override; - bool maybe_preempt_replica_scrub(const hobject_t& oid) override { - return write_blocked_by_scrub(oid); - } + bool maybe_preempt_replica_scrub(const hobject_t& oid) override; int rep_repair_primary_object(const hobject_t& soid, OpContext *ctx); // attr cache handling diff --git a/src/osd/pg_scrubber.cc b/src/osd/pg_scrubber.cc index b6af7e07fc7..bcabad41f73 100644 --- a/src/osd/pg_scrubber.cc +++ b/src/osd/pg_scrubber.cc @@ -34,6 +34,20 @@ template static ostream& _prefix(std::ostream* _dout, T* t) return t->gen_prefix(*_dout) << " scrubber pg(" << t->pg_id << ") "; } +ostream& operator<<(ostream& out, const scrub_flags_t& sf) +{ + if (sf.auto_repair) + out << " AUTO_REPAIR"; + if (sf.check_repair) + out << " CHECK_REPAIR"; + if (sf.deep_scrub_on_error) + out << " DEEP_SCRUB_ON_ERROR"; + if (sf.required) + out << " REQ_SCRUB"; + + return out; +} + ostream& 
operator<<(ostream& out, const requested_scrub_t& sf) { if (sf.must_repair) @@ -58,6 +72,1822 @@ ostream& operator<<(ostream& out, const requested_scrub_t& sf) return out; } +bool PgScrubber::is_event_relevant(epoch_t queued) const +{ + return is_primary() && m_pg->is_active() && m_pg->is_clean() && is_scrub_active() && + !was_epoch_changed() && (!queued || !m_pg->pg_has_reset_since(queued)); +} + +bool PgScrubber::should_abort_scrub(epoch_t queued) const +{ + dout(10) << __func__ << "(): queued:" << queued << " required: " << m_flags.required + << " noscrub: " << get_osdmap()->test_flag(CEPH_OSDMAP_NOSCRUB) << " / " + << m_pg->pool.info.has_flag(pg_pool_t::FLAG_NOSCRUB) << dendl; + + if (!is_primary() || !m_pg->is_active() || + (queued && m_pg->pg_has_reset_since(queued))) { + return true; + } + + if (m_flags.required) { + return false; // not stopping 'required' scrubs for configuration changes + } + + if (state_test(PG_STATE_DEEP_SCRUB)) { + if (get_osdmap()->test_flag(CEPH_OSDMAP_NODEEP_SCRUB) || + m_pg->pool.info.has_flag(pg_pool_t::FLAG_NODEEP_SCRUB)) { + dout(10) << "nodeep_scrub set, aborting" << dendl; + return true; + } + } else if (state_test(PG_STATE_SCRUBBING)) { + if (get_osdmap()->test_flag(CEPH_OSDMAP_NOSCRUB) || + m_pg->pool.info.has_flag(pg_pool_t::FLAG_NOSCRUB)) { + dout(10) << "noscrub set, aborting" << dendl; + return true; + } + } + + return false; +} + +void PgScrubber::send_start_scrub() +{ + dout(10) << "scrubber event -->> " << __func__ << dendl; + if (should_abort_scrub(epoch_t(0))) { + dout(10) << __func__ << " aborting!" 
<< dendl; + scrub_clear_state(false); + } else { + m_fsm->my_states(); + m_fsm->process_event(StartScrub{}); + } + dout(10) << "scrubber event --<< " << __func__ << dendl; +} + +void PgScrubber::send_start_after_repair() +{ + dout(10) << "scrubber event -->> " << __func__ << dendl; + m_fsm->my_states(); + m_fsm->process_event(AfterRepairScrub{}); + dout(10) << "scrubber event --<< " << __func__ << dendl; +} + +void PgScrubber::send_scrub_unblock() +{ + dout(10) << "scrubber event -->> " << __func__ << dendl; + if (should_abort_scrub(epoch_t(0))) { + + dout(10) << __func__ << " aborting!" << dendl; + scrub_clear_state(false); + + } else if (is_scrub_active()) { + + m_fsm->my_states(); + m_fsm->process_event(Unblocked{}); + + } else { + dout(10) << __func__ << " ignored as scrub not active" << dendl; + } + dout(10) << "scrubber event --<< " << __func__ << dendl; +} + +void PgScrubber::send_scrub_resched() +{ + dout(10) << "scrubber event -->> " << __func__ << dendl; + if (should_abort_scrub(epoch_t(0))) { + dout(10) << __func__ << " aborting!" 
<< dendl; + scrub_clear_state(false); + } else if (is_scrub_active()) { + m_fsm->my_states(); + m_fsm->process_event(InternalSchedScrub{}); + } else { + // no need to send anything + dout(10) << __func__ << " event no longer relevant" << dendl; + } + dout(10) << "scrubber event --<< " << __func__ << dendl; +} + +void PgScrubber::send_start_replica() +{ + dout(10) << "scrubber event -->> " << __func__ << dendl; + m_fsm->my_states(); + m_fsm->process_event(StartReplica{}); + dout(10) << "scrubber event --<< " << __func__ << dendl; +} + +void PgScrubber::send_sched_replica() +{ + dout(10) << "scrubber event -->> " << __func__ << dendl; + m_fsm->my_states(); + m_fsm->process_event(SchedReplica{}); // retest for map availability + dout(10) << "scrubber event --<< " << __func__ << dendl; +} + +void PgScrubber::active_pushes_notification() +{ + dout(10) << "scrubber event -->> " << __func__ << dendl; + if (should_abort_scrub(epoch_t(0))) { + dout(10) << __func__ << " aborting!" << dendl; + scrub_clear_state(false); + } else { + m_fsm->my_states(); + m_fsm->process_event(ActivePushesUpd{}); + } + dout(10) << "scrubber event --<< " << __func__ << dendl; +} + +void PgScrubber::update_applied_notification(epoch_t epoch_queued) +{ + dout(10) << "scrubber event -->> " << __func__ << "() epoch: " << epoch_queued << dendl; + if (should_abort_scrub(epoch_queued)) { + dout(10) << __func__ << " aborting!" 
<< dendl; + scrub_clear_state(false); + } else { + m_fsm->my_states(); + m_fsm->process_event(UpdatesApplied{}); + } + dout(10) << "scrubber event --<< " << __func__ << dendl; +} + +void PgScrubber::digest_update_notification() +{ + dout(10) << "scrubber event -->> " << __func__ << dendl; + m_fsm->my_states(); + if (is_event_relevant(epoch_t(0))) { + m_fsm->process_event(DigestUpdate{}); + } else { + // no need to send anything + dout(10) << __func__ << " event no longer relevant" << dendl; + } + dout(10) << "scrubber event --<< " << __func__ << dendl; +} + +void PgScrubber::send_epoch_changed() +{ + dout(10) << "scrubber event -->> " << __func__ << dendl; + if (is_scrub_active()) { + m_fsm->my_states(); + m_fsm->process_event(EpochChanged{}); + } + dout(10) << "scrubber event --<< " << __func__ << dendl; +} + +void PgScrubber::send_replica_maps_ready() +{ + dout(10) << "scrubber event -->> " << __func__ << dendl; + m_fsm->my_states(); + if (is_scrub_active()) { + m_fsm->process_event(GotReplicas{}); + } + dout(10) << "scrubber event --<< " << __func__ << dendl; +} + +void PgScrubber::send_replica_pushes_upd() +{ + dout(10) << "scrubber event -->> " << __func__ << dendl; + m_fsm->my_states(); + if (is_scrub_active()) { + m_fsm->process_event(ReplicaPushesUpd{}); + } + dout(10) << "scrubber event --<< " << __func__ << dendl; +} + +void PgScrubber::send_remotes_reserved() +{ + dout(10) << "scrubber event -->> " << __func__ << dendl; + m_fsm->my_states(); + m_fsm->process_event(RemotesReserved{}); // note: too early to check for 'active'! + dout(10) << "scrubber event --<< " << __func__ << dendl; +} + +void PgScrubber::send_reservation_failure() +{ + dout(10) << "scrubber event -->> " << __func__ << dendl; + m_fsm->my_states(); + m_fsm->process_event(ReservationFailure{}); // do not check for 'active'! + dout(10) << "scrubber event --<< " << __func__ << dendl; +} + +bool PgScrubber::is_scrub_active() const +{ + dout(10) << " " << __func__ << " actv? 
" << m_active << "pg:" << m_pg->pg_id << dendl; + return m_active; +} + +bool PgScrubber::is_reserving() const +{ + return m_fsm->is_reserving(); +} + +void PgScrubber::reset_epoch(epoch_t epoch_queued) +{ + dout(10) << __func__ << " PG( " << m_pg->pg_id + << (m_pg->is_primary() ? ") prm" : ") rpl") << " epoch: " << epoch_queued + << " state deep? " << state_test(PG_STATE_DEEP_SCRUB) << dendl; + + dout(10) << __func__ << " STATE_SCRUBBING? " << state_test(PG_STATE_SCRUBBING) << dendl; + m_epoch_queued = epoch_queued; + m_needs_sleep = true; + + m_fsm->assert_not_active(); + + m_is_deep = state_test(PG_STATE_DEEP_SCRUB); +} + +unsigned int PgScrubber::scrub_requeue_priority(Scrub::scrub_prio_t with_priority) const +{ + unsigned int qu_priority = m_flags.priority; + + if (with_priority == Scrub::scrub_prio_t::high_priority) { + qu_priority = + std::max(qu_priority, (unsigned int)m_pg->cct->_conf->osd_client_op_priority); + } + return qu_priority; +} + +unsigned int PgScrubber::scrub_requeue_priority(Scrub::scrub_prio_t with_priority, + unsigned int suggested_priority) const +{ + if (with_priority == Scrub::scrub_prio_t::high_priority) { + suggested_priority = std::max(suggested_priority, + (unsigned int)m_pg->cct->_conf->osd_client_op_priority); + } + return suggested_priority; +} + +// ///////////////////////////////////////////////////////////////////// // +// scrub op registration handling + +bool PgScrubber::is_scrub_registered() const +{ + return !m_scrub_reg_stamp.is_zero(); +} + +void PgScrubber::reg_next_scrub(const requested_scrub_t& request_flags) +{ + if (!is_primary()) { + dout(20) << __func__ << ": not a primary!" 
<< dendl; + return; + } + + dout(10) << __func__ << " planned.m.s: " << request_flags.must_scrub + << ": planned.n.a.: " << request_flags.need_auto + << " stamp: " << m_pg->info.history.last_scrub_stamp << dendl; + + ceph_assert(!is_scrub_registered()); + + utime_t reg_stamp; + bool must = false; + + if (request_flags.must_scrub || request_flags.need_auto) { + // Set the smallest time that isn't utime_t() + reg_stamp = PgScrubber::scrub_must_stamp(); + must = true; + } else if (m_pg->info.stats.stats_invalid && + m_pg->cct->_conf->osd_scrub_invalid_stats) { + reg_stamp = ceph_clock_now(); + must = true; + } else { + reg_stamp = m_pg->info.history.last_scrub_stamp; + } + + dout(9) << __func__ << " pg(" << m_pg_id << ") must: " << must + << " required:" << m_flags.required << " flags: " << request_flags + << " stamp: " << reg_stamp << dendl; + + // note down the sched_time, so we can locate this scrub, and remove it + // later on. + double scrub_min_interval = 0; + double scrub_max_interval = 0; + m_pg->pool.info.opts.get(pool_opts_t::SCRUB_MIN_INTERVAL, &scrub_min_interval); + m_pg->pool.info.opts.get(pool_opts_t::SCRUB_MAX_INTERVAL, &scrub_max_interval); + + m_scrub_reg_stamp = m_osds->reg_pg_scrub(m_pg->info.pgid, reg_stamp, scrub_min_interval, + scrub_max_interval, must); + dout(15) << __func__ << " pg(" << m_pg_id << ") register next scrub, scrub time " + << m_scrub_reg_stamp << ", must = " << (int)must << dendl; +} + +void PgScrubber::unreg_next_scrub() +{ + if (is_scrub_registered()) { + m_osds->unreg_pg_scrub(m_pg->info.pgid, m_scrub_reg_stamp); + m_scrub_reg_stamp = utime_t{}; + } +} + +/// debug/development temporary code: +void PgScrubber::debug_dump_reservations(std::string_view header_txt) const +{ + std::string format; + auto f = Formatter::create(format, "json-pretty", "json-pretty"); + m_osds->dump_scrub_reservations(f); + std::stringstream o; + f->flush(o); + dout(20) << header_txt << o.str() << dendl; + delete f; +} + +void 
PgScrubber::scrub_requested(scrub_level_t scrub_level, + scrub_type_t scrub_type, + requested_scrub_t& req_flags) +{ + dout(10) << __func__ << (scrub_level == scrub_level_t::deep ? " deep " : " shallow ") + << (scrub_type == scrub_type_t::do_repair ? " repair-scrub " : " not-repair ") + << " prev stamp: " << m_scrub_reg_stamp << " " << is_scrub_registered() + << dendl; + + debug_dump_reservations(" before_unreg "); + + unreg_next_scrub(); + + req_flags.must_scrub = true; + req_flags.must_deep_scrub = + (scrub_level == scrub_level_t::deep) || (scrub_type == scrub_type_t::do_repair); + req_flags.must_repair = (scrub_type == scrub_type_t::do_repair); + // User might intervene, so clear this + req_flags.need_auto = false; + req_flags.req_scrub = true; + + dout(20) << __func__ << " pg(" << m_pg_id << ") planned:" << req_flags << dendl; + debug_dump_reservations(" before_reg "); + + reg_next_scrub(req_flags); + + debug_dump_reservations(" after_reg "); +} + +void PgScrubber::request_rescrubbing(requested_scrub_t& req_flags) +{ + dout(10) << __func__ << " existing-" << m_scrub_reg_stamp << " ## " + << is_scrub_registered() << dendl; + debug_dump_reservations(" auto-scrub before "); + + unreg_next_scrub(); + req_flags.need_auto = true; + reg_next_scrub(req_flags); + + debug_dump_reservations(" auto-scrub after "); +} + +bool PgScrubber::reserve_local() +{ + // try to create the reservation object (which translates into asking the + // OSD for the local scrub resource). If failing - undo it immediately + + m_local_osd_resource.emplace(m_pg, m_osds); + if (!m_local_osd_resource->is_reserved()) { + m_local_osd_resource.reset(); + return false; + } + + return true; +} + +// ---------------------------------------------------------------------------- + +bool PgScrubber::has_pg_marked_new_updates() const +{ + auto last_applied = m_pg->recovery_state.get_last_update_applied(); + dout(10) << __func__ << " recovery last: " << last_applied + << " vs. 
scrub's: " << m_subset_last_update << dendl; + + return last_applied >= m_subset_last_update; +} + +void PgScrubber::set_subset_last_update(eversion_t e) +{ + m_subset_last_update = e; +} + +/* + * setting: + * - m_subset_last_update + * - m_max_end + * - end + * - start + * By: + * - setting tentative range based on conf and divisor + * - requesting a partial list of elements from the backend; + * - handling some head/clones issues + * - ... + * + * The selected range is set directly into 'm_start' and 'm_end' + */ +bool PgScrubber::select_range() +{ + m_primary_scrubmap = ScrubMap{}; + m_received_maps.clear(); + + /* get the start and end of our scrub chunk + * + * Our scrub chunk has an important restriction we're going to need to + * respect. We can't let head be start or end. + * Using a half-open interval means that if end == head, + * we'd scrub/lock head and the clone right next to head in different + * chunks which would allow us to miss clones created between + * scrubbing that chunk and scrubbing the chunk including head. + * This isn't true for any of the other clones since clones can + * only be created "just to the left of" head. There is one exception + * to this: promotion of clones which always happens to the left of the + * left-most clone, but promote_object checks the scrubber in that + * case, so it should be ok. Also, it's ok to "miss" clones at the + * left end of the range if we are a tier because they may legitimately + * not exist (see _scrub). + */ + int min_idx = std::max( + 3, m_pg->get_cct()->_conf->osd_scrub_chunk_min / preemption_data.chunk_divisor()); + + int max_idx = std::max(min_idx, m_pg->get_cct()->_conf->osd_scrub_chunk_max / + preemption_data.chunk_divisor()); + + // why mixing 'int' and int64_t? 
RRR + + dout(10) << __func__ << " Min: " << min_idx << " Max: " << max_idx + << " Div: " << preemption_data.chunk_divisor() << dendl; + + hobject_t start = m_start; + hobject_t candidate_end; + std::vector objects; + int ret = m_pg->get_pgbackend()->objects_list_partial(start, min_idx, max_idx, &objects, + &candidate_end); + ceph_assert(ret >= 0); + + if (!objects.empty()) { + + hobject_t back = objects.back(); + while (candidate_end.is_head() && candidate_end == back.get_head()) { + candidate_end = back; + objects.pop_back(); + if (objects.empty()) { + ceph_assert(0 == + "Somehow we got more than 2 objects which" + "have the same head but are not clones"); + } + back = objects.back(); + } + + if (candidate_end.is_head()) { + ceph_assert(candidate_end != back.get_head()); + candidate_end = candidate_end.get_object_boundary(); + } + + } else { + ceph_assert(candidate_end.is_max()); + } + + // is that range free for us? if not - we will be rescheduled later by whoever + // triggered us this time + + if (!m_pg->_range_available_for_scrub(m_start, candidate_end)) { + // we'll be requeued by whatever made us unavailable for scrub + dout(10) << __func__ << ": scrub blocked somewhere in range " + << "[" << m_start << ", " << candidate_end << ")" << dendl; + return false; + } + + m_end = candidate_end; + if (m_end > m_max_end) + m_max_end = m_end; + + dout(15) << __func__ << " range selected: " << m_start << " //// " << m_end << " //// " + << m_max_end << dendl; + return true; +} + +bool PgScrubber::write_blocked_by_scrub(const hobject_t& soid) +{ + if (soid < m_start || soid >= m_end) { + return false; + } + + dout(10) << __func__ << " " << soid << " can preempt? " + << preemption_data.is_preemptable() << dendl; + dout(10) << __func__ << " " << soid << " already? 
" << preemption_data.was_preempted() + << dendl; + + if (preemption_data.is_preemptable()) { + + if (!preemption_data.was_preempted()) { + dout(10) << __func__ << " " << soid << " preempted" << dendl; + + // signal the preemption + preemption_data.do_preempt(); + + } else { + dout(10) << __func__ << " " << soid << " already preempted" << dendl; + } + return false; + } + return true; +} + +bool PgScrubber::range_intersects_scrub(const hobject_t& start, const hobject_t& end) +{ + // does [start, end] intersect [scrubber.start, scrubber.m_max_end) + return (start < m_max_end && end >= m_start); +} + +/** + * if we are required to sleep: + * arrange a callback sometimes later. + * be sure to be able to identify a stale callback. + * Otherwise: perform a requeue (i.e. - rescheduling thru the OSD queue) + * anyway. + */ +void PgScrubber::add_delayed_scheduling() +{ + milliseconds sleep_time{0ms}; + if (m_needs_sleep) { + double scrub_sleep = 1000.0 * m_osds->osd->scrub_sleep_time(m_flags.required); + dout(10) << __func__ << " sleep: " << scrub_sleep << dendl; + sleep_time = milliseconds{long(scrub_sleep)}; + } + dout(15) << __func__ << " sleep: " << sleep_time.count() << " needed? " << m_needs_sleep + << dendl; + + if (sleep_time.count()) { + // schedule a transition for some 'sleep_time' ms in the future + + m_needs_sleep = false; + m_sleep_started_at = ceph_clock_now(); + + // the 'delayer' for crimson is different. Will be factored out. 
+ + spg_t pgid = m_pg->get_pgid(); + auto callbk = new LambdaContext([osds = m_osds, pgid, + scrbr = this]([[maybe_unused]] int r) mutable { + PGRef pg = osds->osd->lookup_lock_pg(pgid); + if (!pg) { + lgeneric_subdout(g_ceph_context, osd, 10) + << "scrub_requeue_callback: Could not find " + << "PG " << pgid << " can't complete scrub requeue after sleep" << dendl; + return; + } + scrbr->m_needs_sleep = true; + lgeneric_dout(scrbr->get_pg_cct(), 7) + << "scrub_requeue_callback: slept for " + << ceph_clock_now() - scrbr->m_sleep_started_at << ", re-queuing scrub" << dendl; + + scrbr->m_sleep_started_at = utime_t{}; + osds->queue_for_scrub_resched(&(*pg), Scrub::scrub_prio_t::low_priority); + pg->unlock(); + }); + + std::lock_guard l(m_osds->sleep_lock); + m_osds->sleep_timer.add_event_after(sleep_time.count() / 1000.0f, callbk); + + } else { + // just a requeue + m_osds->queue_for_scrub_resched(m_pg, Scrub::scrub_prio_t::high_priority); + } +} + +/** + * walk the log to find the latest update that affects our chunk + */ +eversion_t PgScrubber::search_log_for_updates() const +{ + auto& projected = m_pg->projected_log.log; + auto pi = find_if( + projected.crbegin(), projected.crend(), + [this](const auto& e) -> bool { return e.soid >= m_start && e.soid < m_end; }); + + if (pi != projected.crend()) + return pi->version; + + // there was no relevant update entry in the log + + auto& log = m_pg->recovery_state.get_pg_log().get_log().log; + auto p = find_if(log.crbegin(), log.crend(), [this](const auto& e) -> bool { + return e.soid >= m_start && e.soid < m_end; + }); + + if (p == log.crend()) + return eversion_t{}; + else + return p->version; +} + +bool PgScrubber::get_replicas_maps(bool replica_can_preempt) +{ + dout(10) << __func__ << " epoch_start: " << m_epoch_start + << " pg same_interval_since: " << m_pg->info.history.same_interval_since + << dendl; + + bool do_have_replicas = false; + + m_primary_scrubmap_pos.reset(); + + // ask replicas to scan and send maps + for 
(const auto& i : m_pg->get_acting_recovery_backfill()) { + + if (i == m_pg_whoami) + continue; + + do_have_replicas = true; + m_maps_status.mark_replica_map_request(i); + _request_scrub_map(i, m_subset_last_update, m_start, m_end, m_is_deep, + replica_can_preempt); + } + + dout(10) << __func__ << " awaiting" << m_maps_status << dendl; + return do_have_replicas; +} + +bool PgScrubber::was_epoch_changed() const +{ + // for crimson we have m_pg->get_info().history.same_interval_since + dout(10) << __func__ << " epoch_start: " << m_epoch_start + << " from pg: " << m_pg->get_history().same_interval_since << dendl; + + return m_epoch_start < m_pg->get_history().same_interval_since; +} + +void PgScrubber::mark_local_map_ready() +{ + m_maps_status.mark_local_map_ready(); +} + +bool PgScrubber::are_all_maps_available() const +{ + return m_maps_status.are_all_maps_available(); +} + +std::string PgScrubber::dump_awaited_maps() const +{ + return m_maps_status.dump(); +} + +void PgScrubber::_request_scrub_map(pg_shard_t replica, + eversion_t version, + hobject_t start, + hobject_t end, + bool deep, + bool allow_preemption) +{ + ceph_assert(replica != m_pg_whoami); + dout(10) << __func__ << " scrubmap from osd." << replica + << (deep ? " deep" : " shallow") << dendl; + + auto repscrubop = new MOSDRepScrub( + spg_t(m_pg->info.pgid.pgid, replica.shard), version, m_pg->get_osdmap_epoch(), + m_pg->get_last_peering_reset(), start, end, deep, allow_preemption, m_flags.priority, + m_pg->ops_blocked_by_scrub()); + + // default priority. We want the replica-scrub processed prior to any recovery + // or client io messages (we are holding a lock!) 
+ m_osds->send_message_osd_cluster(replica.osd, repscrubop, get_osdmap_epoch()); +} + +void PgScrubber::cleanup_store(ObjectStore::Transaction* t) +{ + if (!m_store) + return; + + struct OnComplete : Context { + std::unique_ptr store; + explicit OnComplete(std::unique_ptr&& store) : store(std::move(store)) + {} + void finish(int) override {} + }; + m_store->cleanup(t); + t->register_on_complete(new OnComplete(std::move(m_store))); + ceph_assert(!m_store); +} + +void PgScrubber::on_init() +{ + // going upwards from 'inactive' + ceph_assert(!is_scrub_active()); + + preemption_data.reset(); + m_pg->publish_stats_to_osd(); + m_epoch_start = m_pg->get_history().same_interval_since; + + dout(10) << __func__ << " start same_interval:" << m_epoch_start << dendl; + + // create a new store + { + ObjectStore::Transaction t; + cleanup_store(&t); + m_store.reset( + Scrub::Store::create(m_pg->osd->store, &t, m_pg->info.pgid, m_pg->coll)); + m_pg->osd->store->queue_transaction(m_pg->ch, std::move(t), nullptr); + } + + m_start = m_pg->info.pgid.pgid.get_hobj_start(); + m_active = true; +} + +void PgScrubber::on_replica_init() +{ + ceph_assert(!m_active); + m_active = true; +} + +void PgScrubber::_scan_snaps(ScrubMap& smap) +{ + hobject_t head; + SnapSet snapset; + + // Test qa/standalone/scrub/osd-scrub-snaps.sh uses this message to verify + // caller using clean_meta_map(), and it works properly. + dout(15) << __func__ << " starts" << dendl; + + for (auto i = smap.objects.rbegin(); i != smap.objects.rend(); ++i) { + + const hobject_t& hoid = i->first; + ScrubMap::object& o = i->second; + + dout(20) << __func__ << " " << hoid << dendl; + + ceph_assert(!hoid.is_snapdir()); + if (hoid.is_head()) { + // parse the SnapSet + bufferlist bl; + if (o.attrs.find(SS_ATTR) == o.attrs.end()) { + continue; + } + bl.push_back(o.attrs[SS_ATTR]); + auto p = bl.cbegin(); + try { + decode(snapset, p); + } catch (...) 
{ + continue; + } + head = hoid.get_head(); + continue; + } + + if (hoid.snap < CEPH_MAXSNAP) { + // check and if necessary fix snap_mapper + if (hoid.get_head() != head) { + derr << __func__ << " no head for " << hoid << " (have " << head << ")" << dendl; + continue; + } + set obj_snaps; + auto p = snapset.clone_snaps.find(hoid.snap); + if (p == snapset.clone_snaps.end()) { + derr << __func__ << " no clone_snaps for " << hoid << " in " << snapset << dendl; + continue; + } + obj_snaps.insert(p->second.begin(), p->second.end()); + set cur_snaps; + int r = m_pg->snap_mapper.get_snaps(hoid, &cur_snaps); + if (r != 0 && r != -ENOENT) { + derr << __func__ << ": get_snaps returned " << cpp_strerror(r) << dendl; + ceph_abort(); + } + if (r == -ENOENT || cur_snaps != obj_snaps) { + ObjectStore::Transaction t; + OSDriver::OSTransaction _t(m_pg->osdriver.get_transaction(&t)); + if (r == 0) { + r = m_pg->snap_mapper.remove_oid(hoid, &_t); + if (r != 0) { + derr << __func__ << ": remove_oid returned " << cpp_strerror(r) << dendl; + ceph_abort(); + } + m_pg->osd->clog->error() + << "osd." << m_pg->osd->whoami << " found snap mapper error on pg " + << m_pg->info.pgid << " oid " << hoid << " snaps in mapper: " << cur_snaps + << ", oi: " << obj_snaps << "...repaired"; + } else { + m_pg->osd->clog->error() + << "osd." << m_pg->osd->whoami << " found snap mapper error on pg " + << m_pg->info.pgid << " oid " << hoid << " snaps missing in mapper" + << ", should be: " << obj_snaps << " was " << cur_snaps << " r " << r + << "...repaired"; + } + m_pg->snap_mapper.add_oid(hoid, obj_snaps, &_t); + + // wait for repair to apply to avoid confusing other bits of the system. + { + dout(15) << __func__ << " wait on repair!" 
<< dendl; + + ceph::condition_variable my_cond; + ceph::mutex my_lock = ceph::make_mutex("PG::_scan_snaps my_lock"); + int e = 0; + bool done; + + t.register_on_applied_sync(new C_SafeCond(my_lock, my_cond, &done, &e)); + + e = m_pg->osd->store->queue_transaction(m_pg->ch, std::move(t)); + if (e != 0) { + derr << __func__ << ": queue_transaction got " << cpp_strerror(e) << dendl; + } else { + std::unique_lock l{my_lock}; + my_cond.wait(l, [&done] { return done; }); + } + } + } + } + } +} + +int PgScrubber::build_primary_map_chunk() +{ + auto ret = build_scrub_map_chunk(m_primary_scrubmap, m_primary_scrubmap_pos, m_start, + m_end, m_is_deep); + + if (ret == -EINPROGRESS) + m_osds->queue_for_scrub_resched(m_pg, Scrub::scrub_prio_t::high_priority); + + return ret; +} + +int PgScrubber::build_replica_map_chunk() +{ + dout(10) << __func__ << " epoch start: " << m_epoch_start << " ep q: " << m_epoch_queued + << dendl; + dout(10) << __func__ << " deep: " << m_is_deep << dendl; + + auto ret = build_scrub_map_chunk(replica_scrubmap, replica_scrubmap_pos, m_start, m_end, + m_is_deep); + + if (ret == 0) { + + // finished! + // In case we restarted smaller chunk, clear old data + + ScrubMap for_meta_scrub; + m_cleaned_meta_map.clear_from(m_start); + m_cleaned_meta_map.insert(replica_scrubmap); + clean_meta_map(for_meta_scrub); + _scan_snaps(for_meta_scrub); + } + + // previous version used low priority here. 
Now switched to using the priority + // of the original message + if (ret == -EINPROGRESS) + requeue_replica(m_replica_request_priority); + + return ret; +} + +int PgScrubber::build_scrub_map_chunk( + ScrubMap& map, ScrubMapBuilder& pos, hobject_t start, hobject_t end, bool deep) +{ + dout(10) << __func__ << " [" << start << "," << end << ") " + << " pos " << pos << " Deep: " << deep << dendl; + + // start + while (pos.empty()) { + + pos.deep = deep; + map.valid_through = m_pg->info.last_update; + + // objects + vector rollback_obs; + pos.ret = + m_pg->get_pgbackend()->objects_list_range(start, end, &pos.ls, &rollback_obs); + dout(10) << __func__ << " while pos empty " << pos.ret << dendl; + if (pos.ret < 0) { + dout(5) << "objects_list_range error: " << pos.ret << dendl; + return pos.ret; + } + dout(10) << __func__ << " pos.ls.empty()? " << (pos.ls.empty() ? "+" : "-") << dendl; + if (pos.ls.empty()) { + break; + } + m_pg->_scan_rollback_obs(rollback_obs); + pos.pos = 0; + return -EINPROGRESS; + } + + // scan objects + while (!pos.done()) { + int r = m_pg->get_pgbackend()->be_scan_list(map, pos); + dout(10) << __func__ << " be r " << r << dendl; + if (r == -EINPROGRESS) { + dout(8 /*20*/) << __func__ << " in progress" << dendl; + return r; + } + } + + // finish + dout(8 /*20*/) << __func__ << " finishing" << dendl; + ceph_assert(pos.done()); + m_pg->_repair_oinfo_oid(map); + + dout(8 /*20*/) << __func__ << " done, got " << map.objects.size() << " items" << dendl; + return 0; +} + +/** + * \todo describe what we are doing here + * + * @param for_meta_scrub + */ +void PgScrubber::clean_meta_map(ScrubMap& for_meta_scrub) +{ + if (m_end.is_max() || m_cleaned_meta_map.objects.empty()) { + m_cleaned_meta_map.swap(for_meta_scrub); + } else { + auto iter = m_cleaned_meta_map.objects.end(); + --iter; // not empty, see 'if' clause + auto begin = m_cleaned_meta_map.objects.begin(); + if (iter->first.has_snapset()) { + ++iter; + } else { + while (iter != begin) { + auto next 
= iter--; + if (next->first.get_head() != iter->first.get_head()) { + ++iter; + break; + } + } + } + for_meta_scrub.objects.insert(begin, iter); + m_cleaned_meta_map.objects.erase(begin, iter); + } +} + +void PgScrubber::run_callbacks() +{ + std::list to_run; + to_run.swap(m_callbacks); + + for (auto& tr : to_run) { + tr->complete(0); + } +} + +void PgScrubber::maps_compare_n_cleanup() +{ + scrub_compare_maps(); + m_start = m_end; + run_callbacks(); + requeue_waiting(); +} + +Scrub::preemption_t* PgScrubber::get_preemptor() +{ + return &preemption_data; +} + +void PgScrubber::requeue_replica(Scrub::scrub_prio_t is_high_priority) +{ + dout(10) << __func__ << dendl; + m_osds->queue_for_rep_scrub_resched(m_pg, is_high_priority, m_flags.priority); +} + +/* + * Process note: called for the arriving "give me your map, replica!" request. Unlike + * the original implementation, we do not requeue the Op waiting for + * updates. Instead - we trigger the FSM. + */ +void PgScrubber::replica_scrub_op(OpRequestRef op) +{ + auto msg = op->get_req(); + dout(10) << __func__ << " pg:" << m_pg->pg_id << " Msg: map_epoch:" << msg->map_epoch + << " min_epoch:" << msg->min_epoch << " deep?" << msg->deep << dendl; + + if (msg->map_epoch < m_pg->info.history.same_interval_since) { + dout(10) << "replica_scrub_op discarding old replica_scrub from " << msg->map_epoch + << " < " << m_pg->info.history.same_interval_since << dendl; + return; + } + + replica_scrubmap = ScrubMap{}; + replica_scrubmap_pos = ScrubMapBuilder{}; + + // m_replica_epoch_start is overwritten if requeued waiting for active pushes + m_replica_epoch_start = m_pg->info.history.same_interval_since; + m_replica_min_epoch = msg->min_epoch; + m_start = msg->start; + m_end = msg->end; + m_max_end = msg->end; + m_is_deep = msg->deep; + m_epoch_start = m_pg->info.history.same_interval_since; + m_replica_request_priority = msg->high_priority ? 
Scrub::scrub_prio_t::high_priority + : Scrub::scrub_prio_t::low_priority; + m_flags.priority = msg->priority ? msg->priority : m_pg->get_scrub_priority(); + + preemption_data.reset(); + preemption_data.force_preemptability(msg->allow_preemption); + + replica_scrubmap_pos.reset(); + + // make sure the FSM is at NotActive + m_fsm->assert_not_active(); + + m_osds->queue_for_rep_scrub(m_pg, m_replica_request_priority, m_flags.priority); +} + +void PgScrubber::replica_scrub(epoch_t epoch_queued) +{ + dout(10) << __func__ << ": " << m_pg->pg_id << " epoch queued: " << epoch_queued + << dendl; + dout(20) << __func__ << " m_epoch_start: " << m_epoch_start + << " better be >= " << m_pg->info.history.same_interval_since << dendl; + dout(20) << __func__ << " m_is_deep: " << m_is_deep << dendl; + + if (m_pg->pg_has_reset_since(epoch_queued)) { + dout(10) << "replica_scrub(epoch,) - reset!" << dendl; + send_epoch_changed(); + return; + } + + if (was_epoch_changed()) { + dout(10) << "replica_scrub(epoch,) - epoch!" << dendl; + send_epoch_changed(); + return; + } + ceph_assert(!is_primary()); // as should have been caught by the epoch-changed check + + send_start_replica(); +} + +void PgScrubber::replica_scrub_resched(epoch_t epoch_queued) +{ + dout(10) << __func__ << ": " << m_pg->pg_id << " epoch queued: " << epoch_queued + << dendl; + + if (m_pg->pg_has_reset_since(epoch_queued)) { + dout(10) << "replica_scrub(epoch,) - reset!" << dendl; + send_epoch_changed(); + return; + } + + if (was_epoch_changed()) { + dout(10) << __func__ << " epoch changed!" 
<< dendl; + send_epoch_changed(); + return; + } + ceph_assert(!is_primary()); // as should have been caught by the epoch-changed check + + send_sched_replica(); +} + +void PgScrubber::set_op_parameters(requested_scrub_t& request) +{ + dout(10) << __func__ << " input: " << request << dendl; + + m_flags.check_repair = request.check_repair; + m_flags.auto_repair = request.auto_repair || request.need_auto; + m_flags.required = request.req_scrub || request.must_scrub; + + m_flags.priority = (request.must_scrub || request.need_auto) + ? get_pg_cct()->_conf->osd_requested_scrub_priority + : m_pg->get_scrub_priority(); + + state_set(PG_STATE_SCRUBBING); + + // will we be deep-scrubbing? + if (request.must_deep_scrub || request.need_auto || request.time_for_deep) { + state_set(PG_STATE_DEEP_SCRUB); + } + + if (request.must_repair || m_flags.auto_repair) { + state_set(PG_STATE_REPAIR); + } + + // the publishing here seems to be required for tests synchronization + m_pg->publish_stats_to_osd(); + m_flags.deep_scrub_on_error = request.deep_scrub_on_error; + request = requested_scrub_t{}; +} + +/** + * RRR \todo ask why we collect from acting+recovery+backfill, but use the size of + * only the acting set + */ +void PgScrubber::scrub_compare_maps() +{ + dout(10) << __func__ << " has maps, analyzing" << dendl; + + // construct authoritative scrub map for type-specific scrubbing + m_cleaned_meta_map.insert(m_primary_scrubmap); + map, std::optional>> missing_digest; + + map maps; + maps[m_pg_whoami] = &m_primary_scrubmap; + + for (const auto& i : m_pg->get_acting_recovery_backfill()) { + if (i == m_pg_whoami) + continue; + dout(2) << __func__ << " replica " << i << " has " + << m_received_maps[i].objects.size() << " items" << dendl; + maps[i] = &m_received_maps[i]; + } + + set master_set; + + // Construct master set + for (const auto& map : maps) { + for (const auto& i : map.second->objects) { + master_set.insert(i.first); + } + } + + stringstream ss; + 
m_pg->get_pgbackend()->be_omap_checks(maps, master_set, m_omap_stats, ss); + + if (!ss.str().empty()) { + m_osds->clog->warn(ss); + } + + if (m_pg->recovery_state.get_acting().size() > 1) { + + // RRR add a comment here + + dout(10) << __func__ << " comparing replica scrub maps" << dendl; + + // Map from object with errors to good peer + map> authoritative; + + dout(2) << __func__ << m_pg->get_primary() << " has " + << m_primary_scrubmap.objects.size() << " items" << dendl; + + ss.str(""); + ss.clear(); + + m_pg->get_pgbackend()->be_compare_scrubmaps( + maps, master_set, state_test(PG_STATE_REPAIR), m_missing, m_inconsistent, + authoritative, missing_digest, m_shallow_errors, m_deep_errors, m_store.get(), + m_pg->info.pgid, m_pg->recovery_state.get_acting(), ss); + dout(2) << ss.str() << dendl; + + if (!ss.str().empty()) { + m_osds->clog->error(ss); + } + + for (auto& i : authoritative) { + list> good_peers; + for (list::const_iterator j = i.second.begin(); j != i.second.end(); + ++j) { + good_peers.emplace_back(maps[*j]->objects[i.first], *j); + } + m_authoritative.emplace(i.first, good_peers); + } + + for (auto i = authoritative.begin(); i != authoritative.end(); ++i) { + m_cleaned_meta_map.objects.erase(i->first); + m_cleaned_meta_map.objects.insert( + *(maps[i->second.back()]->objects.find(i->first))); + } + } + + ScrubMap for_meta_scrub; + clean_meta_map(for_meta_scrub); + + // ok, do the pg-type specific scrubbing + + // (Validates consistency of the object info and snap sets) + scrub_snapshot_metadata(for_meta_scrub, missing_digest); + + // Called here on the primary can use an authoritative map if it isn't the primary + _scan_snaps(for_meta_scrub); + + if (!m_store->empty()) { + + if (state_test(PG_STATE_REPAIR)) { + dout(10) << __func__ << ": discarding scrub results" << dendl; + m_store->flush(nullptr); + } else { + dout(10) << __func__ << ": updating scrub object" << dendl; + ObjectStore::Transaction t; + m_store->flush(&t); + 
m_pg->osd->store->queue_transaction(m_pg->ch, std::move(t), nullptr); + } + } +} + +void PgScrubber::replica_update_start_epoch() +{ + dout(10) << __func__ << " start:" << m_pg->info.history.same_interval_since << dendl; + m_replica_epoch_start = m_pg->info.history.same_interval_since; +} + +/** + * Send the requested map back to the primary (or - if we + * were preempted - let the primary know). + */ +void PgScrubber::send_replica_map(bool was_preempted) +{ + dout(10) << __func__ << " min epoch:" << m_replica_min_epoch + << " epoch_start:" << m_replica_epoch_start << dendl; + + auto reply = new MOSDRepScrubMap(spg_t(m_pg->info.pgid.pgid, m_pg->get_primary().shard), + m_replica_min_epoch, m_pg_whoami); + + reply->preempted = was_preempted; + ::encode(replica_scrubmap, reply->get_data()); + + m_osds->send_message_osd_cluster(m_pg->get_primary().osd, reply, m_replica_min_epoch); +} + +/** + * - if the replica lets us know it was interrupted, we mark the chunk as interrupted. + * The state-machine will react to that when all replica maps are received. + * - when all maps are received, we signal the FSM with the GotReplicas event (see + * scrub_send_replmaps_ready()). Note that due to the no-reentrancy limitations of the + * FSM, we do not 'process' the event directly. Instead - it is queued for the OSD to + * handle (well - the incoming message is marked for fast dispatching, which is an + * even better reason for handling it via the queue). 
+ */ +void PgScrubber::map_from_replica(OpRequestRef op) +{ + auto m = op->get_req(); + dout(15) << __func__ << " " << *m << dendl; + + if (m->map_epoch < m_pg->info.history.same_interval_since) { + dout(10) << __func__ << " discarding old from " << m->map_epoch << " < " + << m_pg->info.history.same_interval_since << dendl; + return; + } + + auto p = const_cast(m->get_data()).cbegin(); + + m_received_maps[m->from].decode(p, m_pg->info.pgid.pool()); + dout(15) << "map version is " << m_received_maps[m->from].valid_through << dendl; + + [[maybe_unused]] auto [is_ok, err_txt] = m_maps_status.mark_arriving_map(m->from); + ceph_assert(is_ok); // and not an error message, following the original code + + if (m->preempted) { + dout(10) << __func__ << " replica was preempted, setting flag" << dendl; + ceph_assert(preemption_data.is_preemptable()); // otherwise - how dare the replica! + preemption_data.do_preempt(); + } + + if (m_maps_status.are_all_maps_available()) { + dout(10) << __func__ << " osd-queuing GotReplicas" << dendl; + m_osds->queue_scrub_got_repl_maps(m_pg, m_pg->is_scrub_blocking_ops()); + } +} + +/** + * we are a replica being asked by the Primary to reserve OSD resources for + * scrubbing + */ +void PgScrubber::handle_scrub_reserve_request(OpRequestRef op) +{ + dout(10) << __func__ << " " << *op->get_req() << dendl; + op->mark_started(); + + if (m_remote_osd_resource.has_value() && m_remote_osd_resource->is_reserved()) { + dout(10) << __func__ << " ignoring reserve request: Already reserved" << dendl; + return; + } + + bool granted{false}; + + if (m_pg->cct->_conf->osd_scrub_during_recovery || !m_osds->is_recovery_active()) { + + m_remote_osd_resource.emplace(m_pg, m_osds); + // OSD resources allocated? + granted = m_remote_osd_resource->is_reserved(); + if (!granted) { + // just forget it + m_remote_osd_resource.reset(); + dout(20) << __func__ << ": failed to reserve remotely" << dendl; + } + } + + dout(10) << __func__ << " reserved? " << (granted ? 
"yes" : "no") << dendl; + + auto m = op->get_req(); + Message* reply = new MOSDScrubReserve( + spg_t(m_pg->info.pgid.pgid, m_pg->get_primary().shard), m->map_epoch, + granted ? MOSDScrubReserve::GRANT : MOSDScrubReserve::REJECT, m_pg_whoami); + + m_osds->send_message_osd_cluster(reply, op->get_req()->get_connection()); +} + +void PgScrubber::handle_scrub_reserve_grant(OpRequestRef op, pg_shard_t from) +{ + dout(10) << __func__ << " " << *op->get_req() << dendl; + op->mark_started(); + + if (m_reservations.has_value()) { + m_reservations->handle_reserve_grant(op, from); + } else { + derr << __func__ << ": replica scrub reservations that will be leaked!" << dendl; + } +} + +void PgScrubber::handle_scrub_reserve_reject(OpRequestRef op, pg_shard_t from) +{ + dout(10) << __func__ << " " << *op->get_req() << dendl; + op->mark_started(); + + if (m_reservations.has_value()) { + // there is an active reservation process. No action is required otherwise. + m_reservations->handle_reserve_reject(op, from); + } +} + +void PgScrubber::handle_scrub_reserve_release(OpRequestRef op) +{ + dout(10) << __func__ << " " << *op->get_req() << dendl; + op->mark_started(); + m_remote_osd_resource.reset(); +} + +void PgScrubber::clear_scrub_reservations() +{ + dout(10) << __func__ << dendl; + m_reservations.reset(); // the remote reservations + m_local_osd_resource.reset(); // the local reservation + m_remote_osd_resource.reset(); // we as replica reserved for a Primary +} + +void PgScrubber::message_all_replicas(int32_t opcode, std::string_view op_text) +{ + ceph_assert(m_pg->recovery_state.get_backfill_targets() + .empty()); // RRR ask: (the code was copied as is) Why checking here? + + std::vector> messages; + messages.reserve(m_pg->get_actingset().size()); + + epoch_t epch = get_osdmap_epoch(); + + for (auto& p : m_pg->get_actingset()) { + + if (p == m_pg_whoami) + continue; + + dout(10) << "scrub requesting " << op_text << " from osd." 
<< p << " Epoch: " << epch + << dendl; + Message* m = new MOSDScrubReserve(spg_t(m_pg->info.pgid.pgid, p.shard), epch, opcode, + m_pg_whoami); + messages.push_back(std::make_pair(p.osd, m)); + } + + if (!messages.empty()) { + m_osds->send_message_osd_cluster(messages, epch); + } +} + +void PgScrubber::unreserve_replicas() +{ + dout(10) << __func__ << dendl; + m_reservations.reset(); +} + +[[nodiscard]] bool PgScrubber::scrub_process_inconsistent() +{ + dout(10) << __func__ << ": checking authoritative" << dendl; + + bool repair = state_test(PG_STATE_REPAIR); + const bool deep_scrub = state_test(PG_STATE_DEEP_SCRUB); + const char* mode = (repair ? "repair" : (deep_scrub ? "deep-scrub" : "scrub")); + dout(20) << __func__ << " deep_scrub: " << deep_scrub << " m_is_deep: " << m_is_deep + << " repair: " << repair << dendl; + + // authoritative only store objects which are missing or inconsistent. + if (!m_authoritative.empty()) { + + stringstream ss; + ss << m_pg->info.pgid << " " << mode << " " << m_missing.size() << " missing, " + << m_inconsistent.size() << " inconsistent objects"; + dout(2) << ss.str() << dendl; + m_osds->clog->error(ss); + + if (repair) { + state_clear(PG_STATE_CLEAN); + + for (const auto& [hobj, shrd_list] : m_authoritative) { + + auto missing_entry = m_missing.find(hobj); + + if (missing_entry != m_missing.end()) { + m_pg->repair_object(hobj, shrd_list, missing_entry->second); + m_fixed_count += missing_entry->second.size(); + } + + if (m_inconsistent.count(hobj)) { + m_pg->repair_object(hobj, shrd_list, m_inconsistent[hobj]); + m_fixed_count += m_inconsistent[hobj].size(); + } + } + } + } + return (!m_authoritative.empty() && repair); +} + +/* + * note: only called for the Primary. 
+ */ +void PgScrubber::scrub_finish() +{ + dout(10) << __func__ << " before flags: " << m_flags + << " deep_scrub_on_error: " << m_flags.deep_scrub_on_error << dendl; + + ceph_assert(m_pg->is_locked()); + + // if the repair request comes from auto-repair and large number of errors, + // we would like to cancel auto-repair + + bool repair = state_test(PG_STATE_REPAIR); + if (repair && m_flags.auto_repair && + m_authoritative.size() > m_pg->cct->_conf->osd_scrub_auto_repair_num_errors) { + + dout(10) << __func__ << " undoing the repair" << dendl; + state_clear(PG_STATE_REPAIR); + repair = false; + } + + bool deep_scrub = state_test(PG_STATE_DEEP_SCRUB); + const char* mode = (repair ? "repair" : (deep_scrub ? "deep-scrub" : "scrub")); + bool do_auto_scrub = false; + + // if a regular scrub had errors within the limit, do a deep scrub to auto repair + if (m_flags.deep_scrub_on_error && m_authoritative.size() && + m_authoritative.size() <= m_pg->cct->_conf->osd_scrub_auto_repair_num_errors) { + ceph_assert(!deep_scrub); + do_auto_scrub = true; + dout(15) << __func__ << " Try to auto repair after scrub errors" << dendl; + } + + m_flags.deep_scrub_on_error = false; + + // type-specific finish (can tally more errors) + _scrub_finish(); + + bool has_error = scrub_process_inconsistent(); + + { + stringstream oss; + oss << m_pg->info.pgid.pgid << " " << mode << " "; + int total_errors = m_shallow_errors + m_deep_errors; + if (total_errors) + oss << total_errors << " errors"; + else + oss << "ok"; + if (!deep_scrub && m_pg->info.stats.stats.sum.num_deep_scrub_errors) + oss << " ( " << m_pg->info.stats.stats.sum.num_deep_scrub_errors + << " remaining deep scrub error details lost)"; + if (repair) + oss << ", " << m_fixed_count << " fixed"; + if (total_errors) + m_osds->clog->error(oss); + else + m_osds->clog->debug(oss); + } + + // Since we don't know which errors were fixed, we can only clear them + // when every one has been fixed. 
+ if (repair) { + if (m_fixed_count == m_shallow_errors + m_deep_errors) { + + ceph_assert(deep_scrub); + m_shallow_errors = 0; + m_deep_errors = 0; + dout(20) << __func__ << " All may be fixed" << dendl; + + } else if (has_error) { + + // Deep scrub in order to get corrected error counts + m_pg->scrub_after_recovery = true; + m_pg->m_planned_scrub.req_scrub = + m_pg->m_planned_scrub.req_scrub || m_flags.required; + + dout(20) << __func__ << " Current 'required': " << m_flags.required + << " Planned 'req_scrub': " << m_pg->m_planned_scrub.req_scrub << dendl; + + } else if (m_shallow_errors || m_deep_errors) { + + // We have errors but nothing can be fixed, so there is no repair + // possible. + state_set(PG_STATE_FAILED_REPAIR); + dout(10) << __func__ << " " << (m_shallow_errors + m_deep_errors) + << " error(s) present with no repair possible" << dendl; + } + } + + { + // finish up + ObjectStore::Transaction t; + m_pg->recovery_state.update_stats( + [this, deep_scrub](auto& history, auto& stats) { + dout(10) << "m_pg->recovery_state.update_stats()" << dendl; + utime_t now = ceph_clock_now(); + history.last_scrub = m_pg->recovery_state.get_info().last_update; + history.last_scrub_stamp = now; + if (m_is_deep) { + history.last_deep_scrub = m_pg->recovery_state.get_info().last_update; + history.last_deep_scrub_stamp = now; + } + + if (deep_scrub) { + if ((m_shallow_errors == 0) && (m_deep_errors == 0)) + history.last_clean_scrub_stamp = now; + stats.stats.sum.num_shallow_scrub_errors = m_shallow_errors; + stats.stats.sum.num_deep_scrub_errors = m_deep_errors; + stats.stats.sum.num_large_omap_objects = m_omap_stats.large_omap_objects; + stats.stats.sum.num_omap_bytes = m_omap_stats.omap_bytes; + stats.stats.sum.num_omap_keys = m_omap_stats.omap_keys; + dout(10 /*25*/) << "scrub_finish shard " << m_pg_whoami + << " num_omap_bytes = " << stats.stats.sum.num_omap_bytes + << " num_omap_keys = " << stats.stats.sum.num_omap_keys + << dendl; + } else { + 
stats.stats.sum.num_shallow_scrub_errors = m_shallow_errors; + // XXX: last_clean_scrub_stamp doesn't mean the pg is not inconsistent + // because of deep-scrub errors + if (m_shallow_errors == 0) + history.last_clean_scrub_stamp = now; + } + stats.stats.sum.num_scrub_errors = stats.stats.sum.num_shallow_scrub_errors + + stats.stats.sum.num_deep_scrub_errors; + if (m_flags.check_repair) { + m_flags.check_repair = false; + if (m_pg->info.stats.stats.sum.num_scrub_errors) { + state_set(PG_STATE_FAILED_REPAIR); + dout(10) << "scrub_finish " << m_pg->info.stats.stats.sum.num_scrub_errors + << " error(s) still present after re-scrub" << dendl; + } + } + return true; + }, + &t); + int tr = m_osds->store->queue_transaction(m_pg->ch, std::move(t), nullptr); + ceph_assert(tr == 0); + + if (!m_pg->snap_trimq.empty()) { + dout(10) << "scrub finished, requeuing snap_trimmer" << dendl; + m_pg->snap_trimmer_scrub_complete(); + } + } + + if (has_error) { + m_pg->queue_peering_event(PGPeeringEventRef(std::make_shared( + get_osdmap_epoch(), get_osdmap_epoch(), PeeringState::DoRecovery()))); + } else { + state_clear(PG_STATE_REPAIR); + } + + cleanup_on_finish(); + if (do_auto_scrub) { + request_rescrubbing(m_pg->m_planned_scrub); + } + + if (m_pg->is_active() && m_pg->is_primary()) { + m_pg->recovery_state.share_pg_info(); + } +} + +Scrub::FsmNext PgScrubber::on_digest_updates() +{ + dout(10) << __func__ << " #pending: " << num_digest_updates_pending << " are we done? " + << num_digest_updates_pending + << (m_end.is_max() ? " " : " ") << dendl; + + if (num_digest_updates_pending == 0) { + + // got all updates, and finished with this chunk. Any more? 
+ if (m_end.is_max()) { + scrub_finish(); + return Scrub::FsmNext::goto_notactive; + } else { + // go get a new chunk (via "requeue") + preemption_data.reset(); + return Scrub::FsmNext::next_chunk; + } + } else { + return Scrub::FsmNext::do_discard; + } +} + +/* + * note that the flags-set fetched from the PG (m_pg->m_planned_scrub) + * is cleared once scrubbing starts; Some of the values dumped here are + * thus transitory. + */ +void PgScrubber::dump(ceph::Formatter* f) const +{ + f->open_object_section("scrubber"); + f->dump_stream("epoch_start") << m_epoch_start; + f->dump_bool("active", m_active); + if (m_active) { + f->dump_stream("start") << m_start; + f->dump_stream("end") << m_end; + f->dump_stream("m_max_end") << m_max_end; + f->dump_stream("subset_last_update") << m_subset_last_update; + f->dump_bool("deep", m_is_deep); + f->dump_bool("must_scrub", (m_pg->m_planned_scrub.must_scrub || m_flags.required)); + f->dump_bool("must_deep_scrub", m_pg->m_planned_scrub.must_deep_scrub); + f->dump_bool("must_repair", m_pg->m_planned_scrub.must_repair); + f->dump_bool("need_auto", m_pg->m_planned_scrub.need_auto); + f->dump_bool("req_scrub", m_flags.required); + f->dump_bool("time_for_deep", m_pg->m_planned_scrub.time_for_deep); + f->dump_bool("auto_repair", m_flags.auto_repair); + f->dump_bool("check_repair", m_flags.check_repair); + f->dump_bool("deep_scrub_on_error", m_flags.deep_scrub_on_error); + f->dump_stream("scrub_reg_stamp") << m_scrub_reg_stamp; // utime_t + f->dump_unsigned("priority", m_flags.priority); + f->dump_int("shallow_errors", m_shallow_errors); + f->dump_int("deep_errors", m_deep_errors); + f->dump_int("fixed", m_fixed_count); + { + f->open_array_section("waiting_on_whom"); + for (const auto& p : m_maps_status.get_awaited()) { + f->dump_stream("shard") << p; + } + f->close_section(); + } + } + f->close_section(); +} + + +void PgScrubber::handle_query_state(ceph::Formatter* f) +{ + dout(10) << __func__ << dendl; + + 
f->open_object_section("scrub"); + f->dump_stream("scrubber.epoch_start") << m_epoch_start; + f->dump_bool("scrubber.active", m_active); + f->dump_stream("scrubber.start") << m_start; + f->dump_stream("scrubber.end") << m_end; + f->dump_stream("scrubber.m_max_end") << m_max_end; + f->dump_stream("scrubber.m_subset_last_update") << m_subset_last_update; + f->dump_bool("scrubber.deep", m_is_deep); + { + f->open_array_section("scrubber.waiting_on_whom"); + for (const auto& p : m_maps_status.get_awaited()) { + f->dump_stream("shard") << p; + } + f->close_section(); + } + + f->dump_string("comment", "DEPRECATED - may be removed in the next release"); + + f->close_section(); +} + +PgScrubber::~PgScrubber() +{ + dout(10) << __func__ << dendl; +} + +PgScrubber::PgScrubber(PG* pg) + : m_pg{pg} + , m_pg_id{pg->pg_id} + , m_osds{m_pg->osd} + , m_pg_whoami{pg->pg_whoami} + , m_epoch_queued{0} + , preemption_data{pg} +{ + dout(20) << " creating PgScrubber for " << pg->pg_id << " / " << m_pg_whoami << dendl; + m_fsm = std::make_unique(m_pg, this); + m_fsm->initiate(); +} + +void PgScrubber::reserve_replicas() +{ + dout(10) << __func__ << dendl; + m_reservations.emplace(m_pg, m_pg_whoami); +} + +// called only for normal end-of-scrub, and only for a Primary +void PgScrubber::cleanup_on_finish() +{ + dout(10) << __func__ << dendl; + ceph_assert(m_pg->is_locked()); + + state_clear(PG_STATE_SCRUBBING); + state_clear(PG_STATE_DEEP_SCRUB); + m_pg->publish_stats_to_osd(); + + m_reservations.reset(); + m_local_osd_resource.reset(); + + m_pg->requeue_ops(m_pg->waiting_for_scrub); + + reset_internal_state(); + // type-specific state clear + _scrub_clear_state(); +} + +// uses process_event(), so must be invoked externally +void PgScrubber::scrub_clear_state(bool keep_repair_state) +{ + dout(10) << __func__ << dendl; + + clear_pgscrub_state(keep_repair_state); + m_fsm->process_event(FullReset{}); +} + +/* + * note: does not access the state-machine + */ +void 
PgScrubber::clear_pgscrub_state(bool keep_repair_state) +{ + dout(10) << __func__ << dendl; + ceph_assert(m_pg->is_locked()); + + state_clear(PG_STATE_SCRUBBING); + state_clear(PG_STATE_DEEP_SCRUB); + if (!keep_repair_state) + state_clear(PG_STATE_REPAIR); + + clear_scrub_reservations(); + m_pg->publish_stats_to_osd(); + + m_pg->requeue_ops(m_pg->waiting_for_scrub); + + reset_internal_state(); + + // type-specific state clear + _scrub_clear_state(); +} + +void PgScrubber::replica_handling_done() +{ + dout(10) << __func__ << dendl; + + state_clear(PG_STATE_SCRUBBING); + state_clear(PG_STATE_DEEP_SCRUB); + + // make sure we cleared the reservations! + + preemption_data.reset(); + m_maps_status.reset(); + m_received_maps.clear(); + + m_start = hobject_t{}; + m_end = hobject_t{}; + m_max_end = hobject_t{}; + m_subset_last_update = eversion_t{}; + m_shallow_errors = 0; + m_deep_errors = 0; + m_fixed_count = 0; + m_omap_stats = (const struct omap_stat_t){0}; + + run_callbacks(); + m_inconsistent.clear(); + m_missing.clear(); + m_authoritative.clear(); + num_digest_updates_pending = 0; + replica_scrubmap = ScrubMap{}; + replica_scrubmap_pos.reset(); + + m_cleaned_meta_map = ScrubMap{}; + m_needs_sleep = true; + m_sleep_started_at = utime_t{}; + + m_active = false; + m_pg->publish_stats_to_osd(); +} + +/* + * note: performs run_callbacks() + * note: reservations-related variables are not reset here + */ +void PgScrubber::reset_internal_state() +{ + dout(10) << __func__ << dendl; + + preemption_data.reset(); + m_maps_status.reset(); + m_received_maps.clear(); + + m_start = hobject_t{}; + m_end = hobject_t{}; + m_max_end = hobject_t{}; + m_subset_last_update = eversion_t{}; + m_shallow_errors = 0; + m_deep_errors = 0; + m_fixed_count = 0; + m_omap_stats = (const struct omap_stat_t){0}; + + run_callbacks(); + + m_inconsistent.clear(); + m_missing.clear(); + m_authoritative.clear(); + num_digest_updates_pending = 0; + m_primary_scrubmap = ScrubMap{}; + 
/**
 * The flags describing the scrub operation being performed. Primary only.
 *
 * Set when the scrub starts, and consulted in multiple locations - mostly
 * when the scrub finishes. A default-constructed object has a priority of 0
 * and all flags cleared.
 */
struct scrub_flags_t {

  /// the priority of this scrub
  unsigned int priority = 0;

  /**
   * set by queue_scrub() if either planned_scrub.auto_repair or
   * need_auto were set.
   * Tested at scrub end.
   */
  bool auto_repair = false;

  /// set when we are scrubbing after a repair, to verify that everything was fixed
  bool check_repair = false;

  /// tested when the scrub ends, to possibly initiate a deep-scrub
  bool deep_scrub_on_error = false;

  /**
   * this scrub must not be aborted.
   * Set for explicitly requested scrubs, and for scrubs originated by the pairing
   * process with the 'repair' flag set (in the RequestScrub event).
   */
  bool required = false;
};
/// are we waiting for resource reservation grants from our replicas?
[[nodiscard]] bool is_reserving() const final;

// NOTE(review): the send_...() / ..._notification() members below presumably
// translate PG-level notifications into scrub state-machine events - confirm
// against their definitions.

void send_start_scrub() final;

void send_start_after_repair() final;

void send_scrub_resched() final;

void active_pushes_notification() final;

void update_applied_notification(epoch_t epoch_queued) final;

void send_scrub_unblock() final;

void digest_update_notification() final;

void send_replica_maps_ready() final;

void send_replica_pushes_upd() final;

void reset_epoch(epoch_t epoch_queued) final;

/**
 * we allow some number of preemptions of the scrub, which means we do
 * not block. Then we start to block. Once we start blocking, we do
 * not stop until the scrub range is completed.
 */
bool write_blocked_by_scrub(const hobject_t& soid) final;

/// true if the given range intersects the scrub interval in any way
bool range_intersects_scrub(const hobject_t& start, const hobject_t& end) final;
fwds to here + void unreserve_replicas() final; + + // managing scrub op registration + + void reg_next_scrub(const requested_scrub_t& request_flags) final; + + void unreg_next_scrub() final; + + void scrub_requested(scrub_level_t scrub_level, + scrub_type_t scrub_type, + requested_scrub_t& req_flags) final; + + /** + * Reserve local scrub resources (managed by the OSD) + * + * Fails if OSD's local-scrubs budget was exhausted + * \returns were local resources reserved? + */ + bool reserve_local() final; + + void handle_query_state(ceph::Formatter* f) final; + + void dump(ceph::Formatter* f) const override; + + // used if we are a replica + + void replica_scrub_op(OpRequestRef op) final; + void replica_scrub(epoch_t epoch_queued) final; + void replica_scrub_resched(epoch_t epoch_queued) final; + + /// the op priority, taken from the primary's request message + Scrub::scrub_prio_t replica_op_priority() const final + { + return m_replica_request_priority; + }; + + unsigned int scrub_requeue_priority(Scrub::scrub_prio_t with_priority, + unsigned int suggested_priority) const final; + /// the version that refers to m_flags.priority + unsigned int scrub_requeue_priority(Scrub::scrub_prio_t with_priority) const final; + + void add_callback(Context* context) final { m_callbacks.push_back(context); } + + [[nodiscard]] bool are_callbacks_pending() const final // used for an assert in PG.cc + { + return !m_callbacks.empty(); + } + + /// handle a message carrying a replica map + void map_from_replica(OpRequestRef op) final; + + /** + * should we requeue blocked ops? + * Applicable to the PrimaryLogScrub derived class. 
+ */ + [[nodiscard]] virtual bool should_requeue_blocked_ops( + eversion_t last_recovery_applied) const override + { + return false; + } + + void scrub_clear_state(bool keep_repair_state = false) final; + + /** + * add to scrub statistics, but only if the soid is below the scrub start + */ + virtual void stats_of_handled_objects(const object_stat_sum_t& delta_stats, + const hobject_t& soid) override + { + ceph_assert(false); + } + + /** + * finalize the parameters of the initiated scrubbing session: + * + * The "current scrub" flags (m_flags) are set from the 'planned_scrub' flag-set; + * PG_STATE_SCRUBBING, and possibly PG_STATE_DEEP_SCRUB & PG_STATE_REPAIR are set. + */ + void set_op_parameters(requested_scrub_t& request) final; + + void cleanup_store(ObjectStore::Transaction* t) final; + + bool get_store_errors(const scrub_ls_arg_t& arg, + scrub_ls_result_t& res_inout) const override + { + return false; + }; + + // ------------------------------------------------------------------------------------------- + // the I/F used by the state-machine (i.e. the implementation of ScrubMachineListener) + + bool select_range() final; + + /// walk the log to find the latest update that affects our chunk + eversion_t search_log_for_updates() const final; + + eversion_t get_last_update_applied() const final + { + return m_pg->recovery_state.get_last_update_applied(); + } + + void requeue_waiting() const final { m_pg->requeue_ops(m_pg->waiting_for_scrub); } + + int pending_active_pushes() const final { return m_pg->active_pushes; } + + void scrub_compare_maps() final; + + void on_init() final; + void on_replica_init() final; + void replica_handling_done() final; + + /// the version of 'scrub_clear_state()' that does not try to invoke FSM services + /// (thus can be called from FSM reactions) + void clear_pgscrub_state(bool keep_repair_state) final; + + void add_delayed_scheduling() final; + + /** + * @returns have we asked at least one replica? 
+ * 'false' means we are configured with no replicas, and + * should expect no maps to arrive. + */ + bool get_replicas_maps(bool replica_can_preempt) final; + + Scrub::FsmNext on_digest_updates() final; + + void send_replica_map(bool was_preempted) final; + + void send_remotes_reserved() final; + void send_reservation_failure() final; + + /** + * does the PG have newer updates than what we (the scrubber) know? + */ + [[nodiscard]] bool has_pg_marked_new_updates() const final; + + void set_subset_last_update(eversion_t e) final; + + void replica_update_start_epoch() final; + + void maps_compare_n_cleanup() final; + + Scrub::preemption_t* get_preemptor() final; + + int build_primary_map_chunk() final; + + int build_replica_map_chunk() final; + + void reserve_replicas() final; + + [[nodiscard]] bool was_epoch_changed() const final; + + void mark_local_map_ready() final; + + [[nodiscard]] bool are_all_maps_available() const final; + + std::string dump_awaited_maps() const final; + + protected: + bool state_test(uint64_t m) const { return m_pg->state_test(m); } + void state_set(uint64_t m) { m_pg->state_set(m); } + void state_clear(uint64_t m) { m_pg->state_clear(m); } + + [[nodiscard]] bool is_primary() const { return m_pg->recovery_state.is_primary(); } + + [[nodiscard]] bool is_scrub_registered() const; + + virtual void _scrub_clear_state() {} + + utime_t m_scrub_reg_stamp; ///< stamp we registered for + + ostream& show(ostream& out) const override; + + public: + // ------------------------------------------------------------------------------------------- + + friend ostream& operator<<(ostream& out, const PgScrubber& scrubber); + + static utime_t scrub_must_stamp() { return utime_t(1, 1); } + + virtual ~PgScrubber(); // must be defined separately, in the .cc file + + [[nodiscard]] bool is_scrub_active() const final; + + private: + void reset_internal_state(); + + void _scan_snaps(ScrubMap& smap); // note that the (non-standard for a + // non-virtual) name of the 
function is searched + // for by the QA standalone tests. Do not modify. + + void clean_meta_map(ScrubMap& for_meta_scrub); + + void run_callbacks(); + + /** + * are we still a clean & healthy scrubbing primary? + * + * relevant only after the initial sched_scrub + */ + [[nodiscard]] bool is_event_relevant(epoch_t queued) const; + + /** + * check the 'no scrub' configuration options. + */ + [[nodiscard]] bool should_abort_scrub(epoch_t queued) const; + + void send_epoch_changed(); + + /** + * return true if any inconsistency/missing is repaired, false otherwise + */ + [[nodiscard]] bool scrub_process_inconsistent(); + + bool m_needs_sleep{true}; ///< should we sleep before being rescheduled? always + ///< 'true', unless we just got out of a sleep period + + + // 'optional', as 'ReplicaReservations' & 'LocalReservation' are 'RAII-designed' + // to guarantee un-reserving when deleted. + std::optional m_reservations; + std::optional m_local_osd_resource; + + /// the 'remote' resource we, as a replica, grant our Primary when it is scrubbing + std::optional m_remote_osd_resource; + + void cleanup_on_finish(); // scrub_clear_state() as called for a Primary when + // Active->NotActive + + /// the part that actually finalizes a scrub + void scrub_finish(); + + utime_t m_sleep_started_at; + + protected: + PG* const m_pg; + + /** + * the derivative-specific scrub-finishing touches: + */ + virtual void _scrub_finish() {} + + /** + * Validate consistency of the object info and snap sets. + */ + virtual void scrub_snapshot_metadata(ScrubMap& map, const missing_map_t& missing_digest) + {} + + // common code used by build_primary_map_chunk() and build_replica_map_chunk(): + int build_scrub_map_chunk(ScrubMap& map, // primary or replica? 
+ ScrubMapBuilder& pos, + hobject_t start, + hobject_t end, + bool deep); + + std::unique_ptr m_fsm; + const spg_t m_pg_id; ///< a local copy of m_pg->pg_id + OSDService* const m_osds; + const pg_shard_t m_pg_whoami; ///< a local copy of m_pg->pg_whoami; + + epoch_t m_epoch_start; ///< epoch when scrubbing was first scheduled + epoch_t m_epoch_queued; + scrub_flags_t m_flags; + + bool m_active{false}; + + eversion_t m_subset_last_update; + + std::unique_ptr m_store; + + int num_digest_updates_pending{0}; + hobject_t m_start, m_end; ///< note: half-closed: [start,end) + + /// Returns reference to current osdmap + const OSDMapRef& get_osdmap() const; + + /// Returns epoch of current osdmap + epoch_t get_osdmap_epoch() const { return get_osdmap()->get_epoch(); } + + CephContext* get_pg_cct() const { return m_pg->cct; } + + void send_start_replica(); + + void send_sched_replica(); + + // collected statistics + int m_shallow_errors{0}; + int m_deep_errors{0}; + int m_fixed_count{0}; + + /// Maps from objects with errors to missing peers + HobjToShardSetMapping m_missing; + + private: + /** + * 'm_is_deep' - is the running scrub a deep one? + * + * Note that most of the code directly checks PG_STATE_DEEP_SCRUB, which is + * primary-only (and is set earlier - when scheduling the scrub). 'm_is_deep' is + * meaningful both for the primary and the replicas, and is used as a parameter when + * building the scrub maps. + */ + bool m_is_deep{false}; + + inline static int fake_count{2}; // unit-tests. To be removed + + /** + * initiate a deep-scrub after the current scrub ended with errors. 
+ */ + void request_rescrubbing(requested_scrub_t& req_flags); + + std::list m_callbacks; + + /** + * send a replica (un)reservation request to the acting set + * + * @param opcode - one of MOSDScrubReserve::REQUEST + * or MOSDScrubReserve::RELEASE + */ + void message_all_replicas(int32_t opcode, std::string_view op_text); + + hobject_t m_max_end; ///< Largest end that may have been sent to replicas + ScrubMap m_primary_scrubmap; + ScrubMapBuilder m_primary_scrubmap_pos; + + std::map m_received_maps; + + /// Cleaned std::map pending snap metadata scrub + ScrubMap m_cleaned_meta_map; + + void _request_scrub_map(pg_shard_t replica, + eversion_t version, + hobject_t start, + hobject_t end, + bool deep, + bool allow_preemption); + + + Scrub::MapsCollectionStatus m_maps_status; + + omap_stat_t m_omap_stats = (const struct omap_stat_t){0}; + + /// Maps from objects with errors to inconsistent peers + HobjToShardSetMapping m_inconsistent; + + /// Maps from object with errors to good peers + std::map>> m_authoritative; + + // ------------ members used if we are a replica + + epoch_t m_replica_epoch_start; + epoch_t m_replica_min_epoch; ///< the min epoch needed to handle this message + + ScrubMapBuilder replica_scrubmap_pos; /// \todo document + ScrubMap replica_scrubmap; /// \todo document + /** + * we mark the request priority as it arrived. It influences the queuing priority + * when we wait for local updates + */ + Scrub::scrub_prio_t m_replica_request_priority; + + /** + * Queue a XX event to be sent to the replica, to trigger a re-check of the + * availability of the scrub map prepared by the backend. + */ + void requeue_replica(Scrub::scrub_prio_t is_high_priority); + /** * the 'preemption' "state-machine". 
// Handles a dequeued 'PGScrub' item: hands the epoch in which the item was
// queued, and the thread-pool handle, to PG::scrub(), then unlocks the PG.
// 'osd' and 'sdata' are not used.
// NOTE(review): the PG is presumably locked by the dequeuing code before run()
// is called - confirm.
void PGScrub::run(OSD* osd, OSDShard* sdata, PGRef& pg, ThreadPool::TPHandle& handle)
{
  pg->scrub(epoch_queued, handle);
  pg->unlock();
}

// Handles a dequeued 'PGScrubAfterRepair' item: forwards to
// PG::recovery_scrub(), then unlocks the PG.
void PGScrubAfterRepair::run(OSD* osd,
                             OSDShard* sdata,
                             PGRef& pg,
                             ThreadPool::TPHandle& handle)
{
  pg->recovery_scrub(epoch_queued, handle);
  pg->unlock();
}

// Handles a dequeued 'PGScrubResched' item: forwards to
// PG::scrub_send_scrub_resched(), then unlocks the PG.
void PGScrubResched::run(OSD* osd,
                         OSDShard* sdata,
                         PGRef& pg,
                         ThreadPool::TPHandle& handle)
{
  pg->scrub_send_scrub_resched(epoch_queued, handle);
  pg->unlock();
}
// Handles a dequeued 'PGScrubGotReplMaps' item: forwards the queueing epoch
// and the thread-pool handle to PG::scrub_send_replmaps_ready(), then unlocks
// the PG.
// NOTE(review): the PG is presumably locked by the dequeuing code before run()
// is called - confirm.
void PGScrubGotReplMaps::run(OSD* osd,
                             OSDShard* sdata,
                             PGRef& pg,
                             ThreadPool::TPHandle& handle)
{
  pg->scrub_send_replmaps_ready(epoch_queued, handle);
  pg->unlock();
}

// Handles a dequeued 'PGRepScrub' item: forwards to PG::replica_scrub(), then
// unlocks the PG.
void PGRepScrub::run(OSD* osd, OSDShard* sdata, PGRef& pg, ThreadPool::TPHandle& handle)
{
  pg->replica_scrub(epoch_queued, handle);
  pg->unlock();
}

// Handles a dequeued 'PGRepScrubResched' item: forwards to
// PG::replica_scrub_resched(), then unlocks the PG.
void PGRepScrubResched::run(OSD* osd,
                            OSDShard* sdata,
                            PGRef& pg,
                            ThreadPool::TPHandle& handle)
{
  pg->replica_scrub_resched(epoch_queued, handle);
  pg->unlock();
}

// Handles a dequeued 'PGScrubReplicaPushes' item: forwards to
// PG::scrub_send_replica_pushes(), then unlocks the PG.
// Only 'osd' is marked [[maybe_unused]], although 'sdata' is not used either.
void PGScrubReplicaPushes::run([[maybe_unused]] OSD* osd,
                               OSDShard* sdata,
                               PGRef& pg,
                               ThreadPool::TPHandle& handle)
{
  pg->scrub_send_replica_pushes(epoch_queued, handle);
  pg->unlock();
}
+ */ +class PGScrubAfterRepair : public PGScrubItem { + public: + PGScrubAfterRepair(spg_t pg, epoch_t epoch_queued) + : PGScrubItem{pg, epoch_queued, "PGScrubAfterRepair"} + {} + void run(OSD* osd, OSDShard* sdata, PGRef& pg, ThreadPool::TPHandle& handle) final; +}; + +class PGScrubPushesUpdate : public PGScrubItem { + public: + PGScrubPushesUpdate(spg_t pg, epoch_t epoch_queued) + : PGScrubItem{pg, epoch_queued, "PGScrubPushesUpdate"} + {} + void run(OSD* osd, OSDShard* sdata, PGRef& pg, ThreadPool::TPHandle& handle) final; +}; + +class PGScrubAppliedUpdate : public PGScrubItem { + public: + PGScrubAppliedUpdate(spg_t pg, epoch_t epoch_queued) + : PGScrubItem{pg, epoch_queued, "PGScrubAppliedUpdate"} + {} + void run(OSD* osd, + OSDShard* sdata, + PGRef& pg, + [[maybe_unused]] ThreadPool::TPHandle& handle) final; +}; + +class PGScrubUnblocked : public PGScrubItem { + public: + PGScrubUnblocked(spg_t pg, epoch_t epoch_queued) + : PGScrubItem{pg, epoch_queued, "PGScrubUnblocked"} + {} + void run(OSD* osd, + OSDShard* sdata, + PGRef& pg, + [[maybe_unused]] ThreadPool::TPHandle& handle) final; +}; + +class PGScrubDigestUpdate : public PGScrubItem { + public: + PGScrubDigestUpdate(spg_t pg, epoch_t epoch_queued) + : PGScrubItem{pg, epoch_queued, "PGScrubDigestUpdate"} + {} + void run(OSD* osd, OSDShard* sdata, PGRef& pg, ThreadPool::TPHandle& handle) final; +}; + +class PGScrubGotReplMaps : public PGScrubItem { + public: + PGScrubGotReplMaps(spg_t pg, epoch_t epoch_queued) + : PGScrubItem{pg, epoch_queued, "PGScrubGotReplMaps"} + {} + void run(OSD* osd, OSDShard* sdata, PGRef& pg, ThreadPool::TPHandle& handle) final; +}; + +class PGRepScrub : public PGScrubItem { + public: + PGRepScrub(spg_t pg, epoch_t epoch_queued) : PGScrubItem{pg, epoch_queued, "PGRepScrub"} + {} + void run(OSD* osd, OSDShard* sdata, PGRef& pg, ThreadPool::TPHandle& handle) final; +}; + +class PGRepScrubResched : public PGScrubItem { + public: + PGRepScrubResched(spg_t pg, epoch_t epoch_queued) + : 
/**
 * The queue item type named "PGScrubReplicaPushes".
 *
 * The constructor only forwards the PG id, the queueing epoch and the item's
 * name to the PGScrubItem base class; run() is defined in the .cc file.
 */
class PGScrubReplicaPushes : public PGScrubItem {
 public:
  PGScrubReplicaPushes(spg_t pg, epoch_t epoch_queued)
      : PGScrubItem{pg, epoch_queued, "PGScrubReplicaPushes"}
  {}
  void run(OSD* osd, OSDShard* sdata, PGRef& pg, ThreadPool::TPHandle& handle) final;
};