From c0c14a4361fc10e98f0f523898fce8c4672aa64a Mon Sep 17 00:00:00 2001 From: Ronen Friedman Date: Wed, 2 Dec 2020 10:58:49 +0200 Subject: [PATCH] osd: modified handling of scrub unexpected events (following code-review comments) main changes: - incoming scrub events are now validated by the scrubber before being sent to the SFM: - messages from a previous intervals are discarded, possibly signalling a scrub of the current scrub and full scrubber reset; - abort newly mandated by a configuration change is acted upon; - stale messages from previous scrub sessions are discarded; - replica reservations are silently discarded on interval changes - some modifications to the state diagram Signed-off-by: Ronen Friedman --- src/osd/OSD.cc | 23 +- src/osd/OSD.h | 23 +- src/osd/PG.cc | 84 ++--- src/osd/PrimaryLogPG.cc | 5 +- src/osd/PrimaryLogScrub.cc | 8 +- src/osd/pg_scrubber.cc | 638 +++++++++++++++++++--------------- src/osd/pg_scrubber.h | 136 +++++--- src/osd/scrub_machine.cc | 207 ++++------- src/osd/scrub_machine.h | 70 ++-- src/osd/scrub_machine_lstnr.h | 13 +- src/osd/scrubber_common.h | 43 +-- 11 files changed, 641 insertions(+), 609 deletions(-) diff --git a/src/osd/OSD.cc b/src/osd/OSD.cc index d0d1f8a6097c2..8c60f5ea3b96a 100644 --- a/src/osd/OSD.cc +++ b/src/osd/OSD.cc @@ -1806,6 +1806,7 @@ void OSDService::queue_for_scrub_denied(PG* pg, Scrub::scrub_prio_t with_priorit void OSDService::queue_for_scrub_resched(PG* pg, Scrub::scrub_prio_t with_priority) { + // Resulting scrub event: 'InternalSchedScrub' queue_scrub_event_msg(pg, with_priority); } @@ -7433,28 +7434,6 @@ bool OSDService::ScrubJob::ScrubJob::operator<(const OSDService::ScrubJob& rhs) return pgid < rhs.pgid; } -// this one is only moved here (from the header) temporarily, for debugging: -void OSDService::unreg_pg_scrub(spg_t pgid, utime_t t) -{ - std::lock_guard l{OSDService::sched_scrub_lock}; - size_t removed = sched_scrub_pg.erase(ScrubJob{cct, pgid, t}); - ceph_assert(removed); - dout(10) << __func__ << " scrub-set removed: " << pgid << " T(" << t << ")" << dendl; -} - -// this one is only moved here (from the header) temporarily, for debugging: -utime_t OSDService::reg_pg_scrub(spg_t pgid, utime_t t, double pool_scrub_min_interval, - double pool_scrub_max_interval, bool must) -{ - ScrubJob scrub_job(cct, pgid, t, pool_scrub_min_interval, pool_scrub_max_interval, - must); - std::lock_guard l(OSDService::sched_scrub_lock); - auto [x, inserted] = sched_scrub_pg.insert(scrub_job); - dout(10) << __func__ << " scrub-set inserted: " << pgid << " T(" << t << ")" << " must: " << must << " inserted " - << inserted << dendl; - return scrub_job.sched_time; -} - void OSDService::dumps_scrub(ceph::Formatter *f) { ceph_assert(f != nullptr); diff --git a/src/osd/OSD.h b/src/osd/OSD.h index b64e35c707579..25d7359657dd1 100644 --- a/src/osd/OSD.h +++ b/src/osd/OSD.h @@ -291,10 +291,25 @@ public: }; std::set sched_scrub_pg; - /// @returns the scrub_reg_stamp used for unregister'ing the scrub job - utime_t reg_pg_scrub(spg_t pgid, utime_t t, double pool_scrub_min_interval, - double pool_scrub_max_interval, bool must); - void unreg_pg_scrub(spg_t pgid, utime_t t); + /// @returns the scrub_reg_stamp used for unregistering the scrub job + utime_t reg_pg_scrub(spg_t pgid, + utime_t t, + double pool_scrub_min_interval, + double pool_scrub_max_interval, + bool must) { + ScrubJob scrub_job(cct, pgid, t, pool_scrub_min_interval, pool_scrub_max_interval, + must); + std::lock_guard l(OSDService::sched_scrub_lock); + sched_scrub_pg.insert(scrub_job); + return scrub_job.sched_time; + } + + void unreg_pg_scrub(spg_t pgid, utime_t t) { + std::lock_guard l(sched_scrub_lock); + size_t removed = sched_scrub_pg.erase(ScrubJob(cct, pgid, t)); + ceph_assert(removed); + } + bool first_scrub_stamp(ScrubJob *out) { std::lock_guard l(sched_scrub_lock); if (sched_scrub_pg.empty()) diff --git a/src/osd/PG.cc b/src/osd/PG.cc index c06187b3106e9..2c82aac4dbda5 100644 --- a/src/osd/PG.cc +++ b/src/osd/PG.cc @@ -2064,80 +2064,65 @@ void PG::replica_scrub(OpRequestRef op, ThreadPool::TPHandle& handle) m_scrubber->replica_scrub_op(op); } -void PG::scrub(epoch_t queued, ThreadPool::TPHandle& handle) +void PG::scrub(epoch_t epoch_queued, ThreadPool::TPHandle& handle) { - dout(10) << __func__ << (is_primary() ? " (primary)" : " (replica)") << dendl; + dout(10) << __func__ << " queued at: " << epoch_queued << dendl; scrub_queued = false; + ceph_assert(is_primary()); + ceph_assert(!m_scrubber->is_scrub_active()); - if (pg_has_reset_since(queued)) { - dout(10) << " pg::scrub reset_since " << __func__ << " " << queued << dendl; - dout(10) << " pg::scrub reset_since " << __func__ << " " - << recovery_state.get_last_peering_reset() << dendl; - m_scrubber->scrub_clear_state(false); - return; - } + // a new scrub - ceph_assert( - is_primary()); // as the replica request should have reached PG::replica_scrub() + m_scrubber->reset_epoch(epoch_queued); - ceph_assert(!m_scrubber->is_scrub_active()); - // a new scrub - m_scrubber->reset_epoch(queued); - m_scrubber->send_start_scrub(); + // note: send_start_scrub() will verify 'epoch queued' against our current interval + m_scrubber->send_start_scrub(epoch_queued); } // note: no need to secure OSD resources for a recovery scrub void PG::recovery_scrub(epoch_t epoch_queued, ThreadPool::TPHandle& handle) { - dout(10) << "pg::" << __func__ << " queued at: " << epoch_queued << dendl; + dout(10) << __func__ << " queued at: " << epoch_queued << dendl; scrub_queued = false; - - if (pg_has_reset_since(epoch_queued)) { - dout(10) << " reset_since " << __func__ << " " << epoch_queued << dendl; - dout(10) << " reset_since " << __func__ << " " - << recovery_state.get_last_peering_reset() << dendl; - return; - } - ceph_assert(is_primary()); ceph_assert(!m_scrubber->is_scrub_active()); // a new scrub m_scrubber->reset_epoch(epoch_queued); - m_scrubber->send_start_after_repair(); + m_scrubber->send_start_after_repair(epoch_queued); } void PG::replica_scrub(epoch_t epoch_queued, [[maybe_unused]] ThreadPool::TPHandle& handle) { - dout(10) << "pg::" << __func__ << " queued at: " << epoch_queued + dout(10) << __func__ << " queued at: " << epoch_queued << (is_primary() ? " (primary)" : " (replica)") << dendl; scrub_queued = false; - m_scrubber->replica_scrub(epoch_queued); + m_scrubber->send_start_replica(epoch_queued); } void PG::scrub_send_scrub_resched(epoch_t epoch_queued, [[maybe_unused]] ThreadPool::TPHandle& handle) { - dout(10) << __func__ << (is_primary() ? " (primary)" : " (replica)") << dendl; + dout(10) << __func__ << " queued at: " << epoch_queued << dendl; scrub_queued = false; - m_scrubber->send_scrub_resched(); + m_scrubber->send_scrub_resched(epoch_queued); } void PG::scrub_send_resources_granted(epoch_t epoch_queued, [[maybe_unused]] ThreadPool::TPHandle& handle) { dout(10) << __func__ << " queued at: " << epoch_queued << dendl; - m_scrubber->send_remotes_reserved(); + m_scrubber->send_remotes_reserved(epoch_queued); } void PG::scrub_send_resources_denied(epoch_t epoch_queued, [[maybe_unused]] ThreadPool::TPHandle& handle) { dout(10) << __func__ << " queued at: " << epoch_queued << dendl; - m_scrubber->send_reservation_failure(); + m_scrubber->send_reservation_failure(epoch_queued); } void PG::replica_scrub_resched(epoch_t epoch_queued, @@ -2145,64 +2130,49 @@ void PG::replica_scrub_resched(epoch_t epoch_queued, { dout(10) << __func__ << " queued at: " << epoch_queued << dendl; scrub_queued = false; - m_scrubber->replica_scrub_resched(epoch_queued); + m_scrubber->send_sched_replica(epoch_queued); } void PG::scrub_send_pushes_update(epoch_t epoch_queued, [[maybe_unused]] ThreadPool::TPHandle& handle) { dout(10) << __func__ << " queued at: " << epoch_queued << dendl; - if (pg_has_reset_since(epoch_queued)) { - dout(10) << __func__ << " been reset at " - << recovery_state.get_last_peering_reset() << dendl; - return; - } - m_scrubber->active_pushes_notification(); + m_scrubber->active_pushes_notification(epoch_queued); } void PG::scrub_send_replica_pushes(epoch_t epoch_queued, [[maybe_unused]] ThreadPool::TPHandle& handle) { - dout(10) << __func__ << " queued at: " << epoch_queued << dendl; - m_scrubber->send_replica_pushes_upd(); + dout(15) << __func__ << " queued at: " << epoch_queued << dendl; + m_scrubber->send_replica_pushes_upd(epoch_queued); } void PG::scrub_send_applied_update(epoch_t epoch_queued, [[maybe_unused]] ThreadPool::TPHandle& handle) { - dout(10) << __func__ << " queued at: " << epoch_queued << dendl; - if (pg_has_reset_since(epoch_queued)) { - dout(10) << __func__ << " been reset at " - << recovery_state.get_last_peering_reset() << dendl; - return; - } + dout(15) << __func__ << " queued at: " << epoch_queued << dendl; m_scrubber->update_applied_notification(epoch_queued); } void PG::scrub_send_unblocking(epoch_t epoch_queued, [[maybe_unused]] ThreadPool::TPHandle& handle) { - dout(10) << __func__ << " queued at: " << epoch_queued << dendl; - if (pg_has_reset_since(epoch_queued)) { - dout(10) << __func__ << " been reset at " - << recovery_state.get_last_peering_reset() << dendl; - return; - } - m_scrubber->send_scrub_unblock(); + dout(15) << __func__ << " queued at: " << epoch_queued << dendl; + m_scrubber->send_scrub_unblock(epoch_queued); } void PG::scrub_send_digest_update(epoch_t epoch_queued, [[maybe_unused]] ThreadPool::TPHandle& handle) { - dout(10) << __func__ << " queued at: " << epoch_queued << dendl; - m_scrubber->digest_update_notification(); + dout(15) << __func__ << " queued at: " << epoch_queued << dendl; + m_scrubber->digest_update_notification(epoch_queued); } void PG::scrub_send_replmaps_ready(epoch_t epoch_queued, [[maybe_unused]] ThreadPool::TPHandle& handle) { - dout(10) << __func__ << " queued at: " << epoch_queued << dendl; - m_scrubber->send_replica_maps_ready(); + dout(15) << __func__ << " queued at: " << epoch_queued << dendl; + m_scrubber->send_replica_maps_ready(epoch_queued); } bool PG::ops_blocked_by_scrub() const diff --git a/src/osd/PrimaryLogPG.cc b/src/osd/PrimaryLogPG.cc index 2df61761b0dcb..8cb6495a51a33 100644 --- a/src/osd/PrimaryLogPG.cc +++ b/src/osd/PrimaryLogPG.cc @@ -1601,12 +1601,13 @@ int PrimaryLogPG::do_scrub_ls(const MOSDOp *m, OSDOp *osd_op) r = -EAGAIN; } else { bool store_queried = m_scrubber->get_store_errors(arg, result); - if (!store_queried) { + if (store_queried) { + encode(result, osd_op->outdata); + } else { // the scrubber's store is not initialized r = -ENOENT; } } - encode(result, osd_op->outdata); // RRR really? even if no store? return r; } diff --git a/src/osd/PrimaryLogScrub.cc b/src/osd/PrimaryLogScrub.cc index 6cafa256a5e23..39a16a3a3166f 100644 --- a/src/osd/PrimaryLogScrub.cc +++ b/src/osd/PrimaryLogScrub.cc @@ -554,6 +554,7 @@ void PrimaryLogScrub::scrub_snapshot_metadata(ScrubMap& scrubmap, } m_pl_pg->finish_ctx(ctx.get(), pg_log_entry_t::MODIFY); + ++num_digest_updates_pending; ctx->register_on_success([this]() { dout(20) << "updating scrub digest " << num_digest_updates_pending << dendl; if (--num_digest_updates_pending <= 0) { @@ -561,20 +562,17 @@ void PrimaryLogScrub::scrub_snapshot_metadata(ScrubMap& scrubmap, } }); - ++num_digest_updates_pending; m_pl_pg->simple_opc_submit(std::move(ctx)); } dout(10) << __func__ << " (" << mode << ") finish" << dendl; } -PrimaryLogScrub::PrimaryLogScrub(PrimaryLogPG* pg) - : PgScrubber{pg}, m_pl_pg{pg} -{} +PrimaryLogScrub::PrimaryLogScrub(PrimaryLogPG* pg) : PgScrubber{pg}, m_pl_pg{pg} {} void PrimaryLogScrub::_scrub_clear_state() { - dout(7) << __func__ << " - pg(" << m_pl_pg->pg_id << dendl; + dout(15) << __func__ << dendl; m_scrub_cstat = object_stat_collection_t(); } diff --git a/src/osd/pg_scrubber.cc b/src/osd/pg_scrubber.cc index 2afa689aed4b7..8894106dca094 100644 --- a/src/osd/pg_scrubber.cc +++ b/src/osd/pg_scrubber.cc @@ -1,5 +1,5 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab +// -*- mode:C++; tab-width:2; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=2 sw=2 smarttab #include "pg_scrubber.h" @@ -72,123 +72,245 @@ ostream& operator<<(ostream& out, const requested_scrub_t& sf) return out; } -bool PgScrubber::is_event_relevant(epoch_t queued) const +// returns false if the message should be discarded. Handles the notification of interval +// change, if not done already. called only for active scrub? not sure. + +// let's first make this a Primary-only function +bool PgScrubber::check_interval(epoch_t epoch_to_verify) { - return is_primary() && m_pg->is_active() && m_pg->is_clean() && is_scrub_active() && - !was_epoch_changed() && (!queued || !m_pg->pg_has_reset_since(queued)); + const auto current_interval = m_pg->get_same_interval_since(); + + if (epoch_to_verify < current_interval) { + // the event will not be delivered. If we have already noticed and handled + // the change of seasons, it will be silently discarded. Otherwise - we + // reset the scrubber and its FSM. + dout(10) << __func__ << " stale message. epoch: " << epoch_to_verify << " vs. " + << current_interval << " (handled: " << m_last_dead_interval << ")" << dendl; + + if (epoch_to_verify > m_last_dead_interval) { + + // we have not seen this interval change yet. + // The remote reservations are no longer relevant. + + m_last_dead_interval = current_interval; + + // clear the remote reservations. No need to send messages. + if (m_reservations) { + m_reservations->discard_all(); + } + + // stop the scrub and send a reset message to the FSM + scrub_clear_state(); + } + return false; + } + + return true; } -bool PgScrubber::should_abort_scrub(epoch_t queued) const +bool PgScrubber::check_interval_replica(epoch_t epoch_to_verify) { - dout(10) << __func__ << "(): queued:" << queued << " required: " << m_flags.required - << " noscrub: " << get_osdmap()->test_flag(CEPH_OSDMAP_NOSCRUB) << " / " - << m_pg->pool.info.has_flag(pg_pool_t::FLAG_NOSCRUB) << dendl; + const auto current_interval = m_pg->get_same_interval_since(); + + if (epoch_to_verify < current_interval) { + // the event will not be delivered. If we have already noticed and handled + // the change of seasons, it will be silently discarded. Otherwise - we + // reset the scrubber and its FSM. + dout(10) << __func__ << " stale message. epoch: " << epoch_to_verify << " vs. " + << current_interval << " (handled: " << m_last_dead_interval << ")" << dendl; - if (!is_primary() || !m_pg->is_active() || - (queued && m_pg->pg_has_reset_since(queued))) { + if (epoch_to_verify > m_last_dead_interval) { + + // we have not seen this interval change yet. + // The remote reservations are no longer relevant. + + m_last_dead_interval = current_interval; + + // clear the remote reservations. No need to send messages + m_remote_osd_resource.reset(); + + // stop the scrub and send a reset message to the FSM + // replica_handling_done(); + send_interval_changed(); + } + return false; + } + + // verify that we are reserved by the primary + // not true anymore (see rapair scrubs) ceph_assert(m_remote_osd_resource && + // m_remote_osd_resource->is_reserved()); + + return true; +} + +bool PgScrubber::is_message_relevant(epoch_t epoch_to_verify) +{ + if (!m_active) { + // not scrubbing. We can assume that the scrub was already terminated, and we + // can silently discard the incoming event. + return false; + } + + // is this a message from before we started this scrub? + if (epoch_to_verify < m_epoch_start) { + return false; + } + + // check for reasons to abort this scrub + + // has a new interval started? + if (!check_interval(epoch_to_verify)) { + // if this is a new interval, check_interval() just discarded + // remote resources and then killed the scrub + return false; + } + + ceph_assert(is_primary()); + + // were we instructed to abort? + return verify_against_abort(epoch_to_verify); +} + + +// false if the message was discarded because of an abort flag. +// Reset everything if the abort was not handled before. +bool PgScrubber::verify_against_abort(epoch_t epoch_to_verify) +{ + if (!should_abort()) { return true; } + dout(15) << __func__ << " aborting. incoming epoch: " << epoch_to_verify + << "vs last-aborted: " << m_last_aborted << dendl; + + // if we were not aware of the abort before - kill the scrub. + if (epoch_to_verify > m_last_aborted) { + scrub_clear_state(); + m_last_aborted = std::max(epoch_to_verify, m_epoch_start); + } + return false; +} + +bool PgScrubber::should_abort() const +{ if (m_flags.required) { return false; // not stopping 'required' scrubs for configuration changes } - if (state_test(PG_STATE_DEEP_SCRUB)) { + if (m_is_deep) { if (get_osdmap()->test_flag(CEPH_OSDMAP_NODEEP_SCRUB) || m_pg->pool.info.has_flag(pg_pool_t::FLAG_NODEEP_SCRUB)) { dout(10) << "nodeep_scrub set, aborting" << dendl; return true; } - } else if (state_test(PG_STATE_SCRUBBING)) { - if (get_osdmap()->test_flag(CEPH_OSDMAP_NOSCRUB) || - m_pg->pool.info.has_flag(pg_pool_t::FLAG_NOSCRUB)) { - dout(10) << "noscrub set, aborting" << dendl; - return true; - } + } + + if (get_osdmap()->test_flag(CEPH_OSDMAP_NOSCRUB) || + m_pg->pool.info.has_flag(pg_pool_t::FLAG_NOSCRUB)) { + dout(10) << "noscrub set, aborting" << dendl; + return true; } return false; } -void PgScrubber::send_start_scrub() +// sending (processing) state-machine events -------------------------------- + +/* + * a note re the checks performed before sending scrub-initiating messages: + * + * For those ('StartScrub', 'AfterRepairScrub') scrub-initiation messages that + * possibly were in the queue while the PG changed state and became unavailable for + * scrubbing: + * + * The check_interval() catches all major changes to the PG. As for the other conditions + * we may check (and see is_message_relevant() above): + * + * - we are not 'active' yet, so must check against is_active(), andL + * + * - the 'abort' flags were just verified (when the triggering message was queued). As + * those are only modified in human speeds - they need not be queried again. + * + * Some of the considerations above are also relevant to the replica-side initiation + * ('StartReplica' & 'StartReplicaNoWait'). + */ + + +void PgScrubber::send_start_scrub(epoch_t epoch_queued) { - dout(10) << "scrubber event -->> " << __func__ << dendl; - if (should_abort_scrub(epoch_t(0))) { - dout(10) << __func__ << " aborting!" << dendl; - scrub_clear_state(false); - } else { + dout(10) << "scrubber event -->> " << __func__ << " epoch: " << epoch_queued << dendl; + if (check_interval(epoch_queued)) { m_fsm->my_states(); m_fsm->process_event(StartScrub{}); } dout(10) << "scrubber event --<< " << __func__ << dendl; } -void PgScrubber::send_start_after_repair() +void PgScrubber::send_start_after_repair(epoch_t epoch_queued) { - dout(10) << "scrubber event -->> " << __func__ << dendl; - m_fsm->my_states(); - m_fsm->process_event(AfterRepairScrub{}); + dout(10) << "scrubber event -->> " << __func__ << " epoch: " << epoch_queued << dendl; + if (check_interval(epoch_queued)) { + m_fsm->my_states(); + m_fsm->process_event(AfterRepairScrub{}); + } dout(10) << "scrubber event --<< " << __func__ << dendl; } -void PgScrubber::send_scrub_unblock() +void PgScrubber::send_scrub_unblock(epoch_t epoch_queued) { - dout(10) << "scrubber event -->> " << __func__ << dendl; - if (should_abort_scrub(epoch_t(0))) { - - dout(10) << __func__ << " aborting!" << dendl; - scrub_clear_state(false); - - } else if (is_scrub_active()) { - + dout(10) << "scrubber event -->> " << __func__ << " epoch: " << epoch_queued << dendl; + if (is_message_relevant(epoch_queued)) { m_fsm->my_states(); m_fsm->process_event(Unblocked{}); - - } else { - dout(10) << __func__ << " ignored as scrub not active" << dendl; } dout(10) << "scrubber event --<< " << __func__ << dendl; } -void PgScrubber::send_scrub_resched() +void PgScrubber::send_scrub_resched(epoch_t epoch_queued) { - dout(10) << "scrubber event -->> " << __func__ << dendl; - if (should_abort_scrub(epoch_t(0))) { - dout(10) << __func__ << " aborting!" << dendl; - scrub_clear_state(false); - } else if (is_scrub_active()) { + dout(10) << "scrubber event -->> " << __func__ << " epoch: " << epoch_queued << dendl; + if (is_message_relevant(epoch_queued)) { m_fsm->my_states(); m_fsm->process_event(InternalSchedScrub{}); - } else { - // no need to send anything - dout(10) << __func__ << " event no longer relevant" << dendl; } - dout(10) << "scrubber event --<< " << __func__ << dendl; } -void PgScrubber::send_start_replica() +void PgScrubber::send_start_replica(epoch_t epoch_queued) { - dout(10) << "scrubber event -->> " << __func__ << dendl; - m_fsm->my_states(); - m_fsm->process_event(StartReplica{}); + dout(10) << "scrubber event -->> " << __func__ << " epoch: " << epoch_queued << dendl; + if (is_primary()) { + // shouldn't happen. Ignore + dout(1) << "got a replica scrub request while Primary!" << dendl; + return; + } + if (check_interval_replica(epoch_queued)) { + m_fsm->my_states(); + // buy us some time by not waiting for updates if there are none + // to wait for. Affects the transition from NotActive into either + // ReplicaWaitUpdates or ActiveReplica. + if (pending_active_pushes()) + m_fsm->process_event(StartReplica{}); + else + m_fsm->process_event(StartReplicaNoWait{}); + } dout(10) << "scrubber event --<< " << __func__ << dendl; } -void PgScrubber::send_sched_replica() +void PgScrubber::send_sched_replica(epoch_t epoch_queued) { - dout(10) << "scrubber event -->> " << __func__ << dendl; - m_fsm->my_states(); - m_fsm->process_event(SchedReplica{}); // retest for map availability + dout(10) << "scrubber event -->> " << __func__ << " epoch: " << epoch_queued << dendl; + if (check_interval_replica(epoch_queued)) { + m_fsm->my_states(); + m_fsm->process_event(SchedReplica{}); // retest for map availability + } dout(10) << "scrubber event --<< " << __func__ << dendl; } -void PgScrubber::active_pushes_notification() +void PgScrubber::active_pushes_notification(epoch_t epoch_queued) { - dout(10) << "scrubber event -->> " << __func__ << dendl; - if (should_abort_scrub(epoch_t(0))) { - dout(10) << __func__ << " aborting!" << dendl; - scrub_clear_state(false); - } else { + // note: Primary only + dout(10) << "scrubber event -->> " << __func__ << " epoch: " << epoch_queued << dendl; + if (is_message_relevant(epoch_queued)) { m_fsm->my_states(); m_fsm->process_event(ActivePushesUpd{}); } @@ -197,82 +319,76 @@ void PgScrubber::active_pushes_notification() void PgScrubber::update_applied_notification(epoch_t epoch_queued) { - dout(10) << "scrubber event -->> " << __func__ << "() epoch: " << epoch_queued << dendl; - if (should_abort_scrub(epoch_queued)) { - dout(10) << __func__ << " aborting!" << dendl; - scrub_clear_state(false); - } else { + // note: Primary only + dout(10) << "scrubber event -->> " << __func__ << " epoch: " << epoch_queued << dendl; + if (is_message_relevant(epoch_queued)) { m_fsm->my_states(); m_fsm->process_event(UpdatesApplied{}); } dout(10) << "scrubber event --<< " << __func__ << dendl; } -void PgScrubber::digest_update_notification() +void PgScrubber::digest_update_notification(epoch_t epoch_queued) { - dout(10) << "scrubber event -->> " << __func__ << dendl; - m_fsm->my_states(); - if (is_event_relevant(epoch_t(0))) { + // note: Primary only + dout(10) << "scrubber event -->> " << __func__ << " epoch: " << epoch_queued << dendl; + if (is_message_relevant(epoch_queued)) { + m_fsm->my_states(); m_fsm->process_event(DigestUpdate{}); - } else { - // no need to send anything - dout(10) << __func__ << " event no longer relevant" << dendl; } dout(10) << "scrubber event --<< " << __func__ << dendl; } -void PgScrubber::send_epoch_changed() +// no checks should be performed here +void PgScrubber::send_interval_changed() { dout(10) << "scrubber event -->> " << __func__ << dendl; - if (is_scrub_active()) { - m_fsm->my_states(); - m_fsm->process_event(EpochChanged{}); - } + m_fsm->my_states(); + m_fsm->process_event(IntervalChanged{}); dout(10) << "scrubber event --<< " << __func__ << dendl; } -void PgScrubber::send_replica_maps_ready() +void PgScrubber::send_replica_maps_ready(epoch_t epoch_queued) { - dout(10) << "scrubber event -->> " << __func__ << dendl; - m_fsm->my_states(); - if (is_scrub_active()) { + dout(10) << "scrubber event -->> " << __func__ << " epoch: " << epoch_queued << dendl; + if (is_message_relevant(epoch_queued)) { + m_fsm->my_states(); m_fsm->process_event(GotReplicas{}); } dout(10) << "scrubber event --<< " << __func__ << dendl; } -void PgScrubber::send_replica_pushes_upd() +void PgScrubber::send_replica_pushes_upd(epoch_t epoch_queued) { - dout(10) << "scrubber event -->> " << __func__ << dendl; - m_fsm->my_states(); - if (is_scrub_active()) { + dout(10) << "scrubber event -->> " << __func__ << " epoch: " << epoch_queued << dendl; + if (check_interval_replica(epoch_queued)) { + m_fsm->my_states(); m_fsm->process_event(ReplicaPushesUpd{}); } dout(10) << "scrubber event --<< " << __func__ << dendl; } -void PgScrubber::send_remotes_reserved() +void PgScrubber::send_remotes_reserved(epoch_t epoch_queued) { - dout(10) << "scrubber event -->> " << __func__ << dendl; - m_fsm->my_states(); - m_fsm->process_event(RemotesReserved{}); // note: too early to check for 'active'! + dout(10) << "scrubber event -->> " << __func__ << " epoch: " << epoch_queued << dendl; + // note: scrub is not active yet + if (check_interval(epoch_queued)) { + m_fsm->my_states(); + m_fsm->process_event(RemotesReserved{}); + } dout(10) << "scrubber event --<< " << __func__ << dendl; } -void PgScrubber::send_reservation_failure() +void PgScrubber::send_reservation_failure(epoch_t epoch_queued) { - dout(10) << "scrubber event -->> " << __func__ << dendl; - m_fsm->my_states(); - m_fsm->process_event(ReservationFailure{}); // do not check for 'active'! + dout(10) << "scrubber event -->> " << __func__ << " epoch: " << epoch_queued << dendl; + if (check_interval(epoch_queued)) { // do not check for 'active'! + m_fsm->my_states(); + m_fsm->process_event(ReservationFailure{}); + } dout(10) << "scrubber event --<< " << __func__ << dendl; } -bool PgScrubber::is_scrub_active() const -{ - dout(10) << " " << __func__ << " actv? " << m_active << "pg:" << m_pg->pg_id << dendl; - return m_active; -} - bool PgScrubber::is_reserving() const { return m_fsm->is_reserving(); @@ -280,16 +396,11 @@ bool PgScrubber::is_reserving() const void PgScrubber::reset_epoch(epoch_t epoch_queued) { - dout(10) << __func__ << " PG( " << m_pg->pg_id - << (m_pg->is_primary() ? ") prm" : ") rpl") << " epoch: " << epoch_queued - << " state deep? " << state_test(PG_STATE_DEEP_SCRUB) << dendl; - - dout(10) << __func__ << " STATE_SCRUBBING? " << state_test(PG_STATE_SCRUBBING) << dendl; - m_epoch_queued = epoch_queued; - m_needs_sleep = true; - + dout(10) << __func__ << " state deep? " << state_test(PG_STATE_DEEP_SCRUB) << dendl; m_fsm->assert_not_active(); + m_epoch_start = epoch_queued; + m_needs_sleep = true; m_is_deep = state_test(PG_STATE_DEEP_SCRUB); } @@ -315,7 +426,7 @@ unsigned int PgScrubber::scrub_requeue_priority(Scrub::scrub_prio_t with_priorit } // ///////////////////////////////////////////////////////////////////// // -// scrub op registration handling +// scrub-op registration handling bool PgScrubber::is_scrub_registered() const { @@ -350,9 +461,9 @@ void PgScrubber::reg_next_scrub(const requested_scrub_t& request_flags) reg_stamp = m_pg->info.history.last_scrub_stamp; } - dout(9) << __func__ << " pg(" << m_pg_id << ") must: " << must - << " required:" << m_flags.required << " flags: " << request_flags - << " stamp: " << reg_stamp << dendl; + dout(15) << __func__ << " pg(" << m_pg_id << ") must: " << must + << " required:" << m_flags.required << " flags: " << request_flags + << " stamp: " << reg_stamp << dendl; // note down the sched_time, so we can locate this scrub, and remove it // later on. @@ -401,7 +512,7 @@ void PgScrubber::scrub_requested(scrub_level_t scrub_level, void PgScrubber::request_rescrubbing(requested_scrub_t& req_flags) { - dout(10) << __func__ << " existing-" << m_scrub_reg_stamp << " ## " + dout(10) << __func__ << " existing-" << m_scrub_reg_stamp << ". was registered? " << is_scrub_registered() << dendl; unreg_next_scrub(); @@ -437,6 +548,7 @@ bool PgScrubber::has_pg_marked_new_updates() const void PgScrubber::set_subset_last_update(eversion_t e) { m_subset_last_update = e; + dout(15) << __func__ << " last-update: " << e << dendl; } /* @@ -618,9 +730,6 @@ void PgScrubber::add_delayed_scheduling() } } -/** - * walk the log to find the latest update that affects our chunk - */ eversion_t PgScrubber::search_log_for_updates() const { auto& projected = m_pg->projected_log.log; @@ -646,7 +755,8 @@ eversion_t PgScrubber::search_log_for_updates() const bool PgScrubber::get_replicas_maps(bool replica_can_preempt) { - dout(10) << __func__ << " epoch_start: " << m_epoch_start + dout(10) << __func__ << " started in epoch/interval: " << m_epoch_start << "/" + << m_interval_start << " pg same_interval_since: " << m_pg->info.history.same_interval_since << dendl; @@ -673,10 +783,10 @@ bool PgScrubber::get_replicas_maps(bool replica_can_preempt) bool PgScrubber::was_epoch_changed() const { // for crimson we have m_pg->get_info().history.same_interval_since - dout(10) << __func__ << " epoch_start: " << m_epoch_start + dout(10) << __func__ << " epoch_start: " << m_interval_start << " from pg: " << m_pg->get_history().same_interval_since << dendl; - return m_epoch_start < m_pg->get_history().same_interval_since; + return m_interval_start < m_pg->get_history().same_interval_since; } void PgScrubber::mark_local_map_ready() @@ -705,10 +815,10 @@ void PgScrubber::_request_scrub_map(pg_shard_t replica, dout(10) << __func__ << " scrubmap from osd." << replica << (deep ? " deep" : " shallow") << dendl; - auto repscrubop = new MOSDRepScrub( - spg_t(m_pg->info.pgid.pgid, replica.shard), version, m_pg->get_osdmap_epoch(), - m_pg->get_last_peering_reset(), start, end, deep, allow_preemption, m_flags.priority, - m_pg->ops_blocked_by_scrub()); + auto repscrubop = + new MOSDRepScrub(spg_t(m_pg->info.pgid.pgid, replica.shard), version, + get_osdmap_epoch(), m_pg->get_last_peering_reset(), start, end, deep, + allow_preemption, m_flags.priority, m_pg->ops_blocked_by_scrub()); // default priority. We want the replica-scrub processed prior to any recovery // or client io messages (we are holding a lock!) @@ -738,9 +848,10 @@ void PgScrubber::on_init() preemption_data.reset(); m_pg->publish_stats_to_osd(); - m_epoch_start = m_pg->get_history().same_interval_since; + m_interval_start = m_pg->get_history().same_interval_since; + // m_epoch_started = m_pg->get_osdmap_epoch(); - dout(10) << __func__ << " start same_interval:" << m_epoch_start << dendl; + dout(10) << __func__ << " start same_interval:" << m_interval_start << dendl; // create a new store { @@ -752,6 +863,7 @@ void PgScrubber::on_init() } m_start = m_pg->info.pgid.pgid.get_hobj_start(); + m_last_dead_interval = get_osdmap_epoch(); m_active = true; } @@ -759,6 +871,8 @@ void PgScrubber::on_replica_init() { ceph_assert(!m_active); m_active = true; + m_last_dead_interval = get_osdmap_epoch(); // so that check_interval_replica() won't + // kill a scrub for stale messages } void PgScrubber::_scan_snaps(ScrubMap& smap) @@ -766,9 +880,9 @@ void PgScrubber::_scan_snaps(ScrubMap& smap) hobject_t head; SnapSet snapset; - // Test qa/standalone/scrub/osd-scrub-snaps.sh uses this message to verify - // caller using clean_meta_map(), and it works properly. - dout(15) << __func__ << " starts" << dendl; + // Test qa/standalone/scrub/osd-scrub-snaps.sh greps for the strings + // in this function + dout(15) << "_scan_snaps starts" << dendl; for (auto i = smap.objects.rbegin(); i != smap.objects.rend(); ++i) { @@ -866,16 +980,15 @@ int PgScrubber::build_primary_map_chunk() m_end, m_is_deep); if (ret == -EINPROGRESS) - m_osds->queue_for_scrub_resched(m_pg, Scrub::scrub_prio_t::high_priority); + m_osds->queue_for_scrub_resched(m_pg, Scrub::scrub_prio_t::low_priority); return ret; } int PgScrubber::build_replica_map_chunk() { - dout(10) << __func__ << " epoch start: " << m_epoch_start << " ep q: " << m_epoch_queued - << dendl; - dout(10) << __func__ << " deep: " << m_is_deep << dendl; + dout(10) << __func__ << " interval start: " << m_interval_start + << " epoch: " << m_epoch_start << " deep: " << m_is_deep << dendl; auto ret = build_scrub_map_chunk(replica_scrubmap, replica_scrubmap_pos, m_start, m_end, m_is_deep); @@ -885,10 +998,9 @@ int PgScrubber::build_replica_map_chunk() // finished! // In case we restarted smaller chunk, clear old data - ScrubMap for_meta_scrub; m_cleaned_meta_map.clear_from(m_start); m_cleaned_meta_map.insert(replica_scrubmap); - clean_meta_map(for_meta_scrub); + auto for_meta_scrub = clean_meta_map(); _scan_snaps(for_meta_scrub); } @@ -949,13 +1061,18 @@ int PgScrubber::build_scrub_map_chunk( return 0; } -/** - * \todo describe what we are doing here +/* + * Process: + * Building a map of objects suitable for snapshot validation. + * The data in m_cleaned_meta_map is the left over partial items that need to + * be completed before they can be processed. * - * @param for_meta_scrub + * Snapshots in maps precede the head object, which is why we are scanning backwards. */ -void PgScrubber::clean_meta_map(ScrubMap& for_meta_scrub) +ScrubMap PgScrubber::clean_meta_map() { + ScrubMap for_meta_scrub; + if (m_end.is_max() || m_cleaned_meta_map.objects.empty()) { m_cleaned_meta_map.swap(for_meta_scrub); } else { @@ -976,6 +1093,8 @@ void PgScrubber::clean_meta_map(ScrubMap& for_meta_scrub) for_meta_scrub.objects.insert(begin, iter); m_cleaned_meta_map.objects.erase(begin, iter); } + + return for_meta_scrub; } void PgScrubber::run_callbacks() @@ -996,9 +1115,9 @@ void PgScrubber::maps_compare_n_cleanup() requeue_waiting(); } -Scrub::preemption_t* PgScrubber::get_preemptor() +Scrub::preemption_t& PgScrubber::get_preemptor() { - return &preemption_data; + return preemption_data; } void PgScrubber::requeue_replica(Scrub::scrub_prio_t is_high_priority) @@ -1018,23 +1137,29 @@ void PgScrubber::replica_scrub_op(OpRequestRef op) dout(10) << __func__ << " pg:" << m_pg->pg_id << " Msg: map_epoch:" << msg->map_epoch << " min_epoch:" << msg->min_epoch << " deep?" << msg->deep << dendl; + // are we still processing a previous scrub-map request without noticing that the + // interval changed? won't see it here, but rather at the reservation stage. + + if (msg->map_epoch < m_pg->info.history.same_interval_since) { dout(10) << "replica_scrub_op discarding old replica_scrub from " << msg->map_epoch << " < " << m_pg->info.history.same_interval_since << dendl; + + // is there a general sync issue? are we holding a stale reservation? + // not checking now - assuming we will actively react to interval change. + return; } replica_scrubmap = ScrubMap{}; replica_scrubmap_pos = ScrubMapBuilder{}; - // m_replica_epoch_start is overwritten if requeued waiting for active pushes - m_replica_epoch_start = m_pg->info.history.same_interval_since; m_replica_min_epoch = msg->min_epoch; m_start = msg->start; m_end = msg->end; m_max_end = msg->end; m_is_deep = msg->deep; - m_epoch_start = m_pg->info.history.same_interval_since; + m_interval_start = m_pg->info.history.same_interval_since; m_replica_request_priority = msg->high_priority ? Scrub::scrub_prio_t::high_priority : Scrub::scrub_prio_t::low_priority; m_flags.priority = msg->priority ? msg->priority : m_pg->get_scrub_priority(); @@ -1050,55 +1175,14 @@ void PgScrubber::replica_scrub_op(OpRequestRef op) m_osds->queue_for_rep_scrub(m_pg, m_replica_request_priority, m_flags.priority); } -void PgScrubber::replica_scrub(epoch_t epoch_queued) -{ - dout(10) << __func__ << ": " << m_pg->pg_id << " epoch queued: " << epoch_queued - << dendl; - dout(20) << __func__ << " m_epoch_start: " << m_epoch_start - << " better be >= " << m_pg->info.history.same_interval_since << dendl; - dout(20) << __func__ << " m_is_deep: " << m_is_deep << dendl; - - if (m_pg->pg_has_reset_since(epoch_queued)) { - dout(10) << "replica_scrub(epoch,) - reset!" << dendl; - send_epoch_changed(); - return; - } - - if (was_epoch_changed()) { - dout(10) << "replica_scrub(epoch,) - epoch!" << dendl; - send_epoch_changed(); - return; - } - ceph_assert(!is_primary()); // as should have been caught by the epoch-changed check - - send_start_replica(); -} - -void PgScrubber::replica_scrub_resched(epoch_t epoch_queued) -{ - dout(10) << __func__ << ": " << m_pg->pg_id << " epoch queued: " << epoch_queued - << dendl; - - if (m_pg->pg_has_reset_since(epoch_queued)) { - dout(10) << "replica_scrub(epoch,) - reset!" << dendl; - send_epoch_changed(); - return; - } - - if (was_epoch_changed()) { - dout(10) << __func__ << " epoch changed!" << dendl; - send_epoch_changed(); - return; - } - ceph_assert(!is_primary()); // as should have been caught by the epoch-changed check - - send_sched_replica(); -} - void PgScrubber::set_op_parameters(requested_scrub_t& request) { dout(10) << __func__ << " input: " << request << dendl; + // write down the epoch of starting a new scrub. Will be used + // to discard stale messages from previous aborted scrubs. + m_epoch_start = m_pg->get_osdmap_epoch(); + m_flags.check_repair = request.check_repair; m_flags.auto_repair = request.auto_repair || request.need_auto; m_flags.required = request.req_scrub || request.must_scrub; @@ -1124,10 +1208,6 @@ void PgScrubber::set_op_parameters(requested_scrub_t& request) request = requested_scrub_t{}; } -/** - * RRR \todo ask why we collect from acting+recovery+backfill, but use the size of - * only the acting set - */ void PgScrubber::scrub_compare_maps() { dout(10) << __func__ << " has maps, analyzing" << dendl; @@ -1163,7 +1243,7 @@ void PgScrubber::scrub_compare_maps() m_osds->clog->warn(ss); } - if (m_pg->recovery_state.get_acting().size() > 1) { + if (m_pg->recovery_state.get_acting_recovery_backfill().size() > 1) { dout(10) << __func__ << " comparing replica scrub maps" << dendl; @@ -1202,8 +1282,7 @@ void PgScrubber::scrub_compare_maps() } } - ScrubMap for_meta_scrub; - clean_meta_map(for_meta_scrub); + auto for_meta_scrub = clean_meta_map(); // ok, do the pg-type specific scrubbing @@ -1227,38 +1306,30 @@ void PgScrubber::scrub_compare_maps() } } -void PgScrubber::replica_update_start_epoch() -{ - dout(10) << __func__ << " start:" << m_pg->info.history.same_interval_since << dendl; - m_replica_epoch_start = m_pg->info.history.same_interval_since; -} - /** * Send the requested map back to the primary (or - if we * were preempted - let the primary know). */ -void PgScrubber::send_replica_map(bool was_preempted) +void PgScrubber::send_replica_map(PreemptionNoted was_preempted) { - dout(10) << __func__ << " min epoch:" << m_replica_min_epoch - << " epoch_start:" << m_replica_epoch_start << dendl; + dout(10) << __func__ << " min epoch:" << m_replica_min_epoch << dendl; auto reply = new MOSDRepScrubMap(spg_t(m_pg->info.pgid.pgid, m_pg->get_primary().shard), m_replica_min_epoch, m_pg_whoami); - reply->preempted = was_preempted; + reply->preempted = (was_preempted == PreemptionNoted::preempted); ::encode(replica_scrubmap, reply->get_data()); m_osds->send_message_osd_cluster(m_pg->get_primary().osd, reply, m_replica_min_epoch); } -/** +/* * - if the replica lets us know it was interrupted, we mark the chunk as interrupted. * The state-machine will react to that when all replica maps are received. * - when all maps are received, we signal the FSM with the GotReplicas event (see * scrub_send_replmaps_ready()). Note that due to the no-reentrancy limitations of the * FSM, we do not 'process' the event directly. Instead - it is queued for the OSD to - * handle (well - the incoming message is marked for fast dispatching, which is an - * even better reason for handling it via the queue). + * handle. */ void PgScrubber::map_from_replica(OpRequestRef op) { @@ -1276,8 +1347,14 @@ void PgScrubber::map_from_replica(OpRequestRef op) m_received_maps[m->from].decode(p, m_pg->info.pgid.pool()); dout(15) << "map version is " << m_received_maps[m->from].valid_through << dendl; - [[maybe_unused]] auto [is_ok, err_txt] = m_maps_status.mark_arriving_map(m->from); - ceph_assert(is_ok); // and not an error message, following the original code + auto [is_ok, err_txt] = m_maps_status.mark_arriving_map(m->from); + if (!is_ok) { + // previously an unexpected map was triggering an assert. Now, as scrubs can be + // aborted at any time, the chances of this happening have increased, and aborting is + // not justified + dout(1) << __func__ << err_txt << " from OSD " << m->from << dendl; + return; + } if (m->preempted) { dout(10) << __func__ << " replica was preempted, setting flag" << dendl; @@ -1286,30 +1363,50 @@ void PgScrubber::map_from_replica(OpRequestRef op) } if (m_maps_status.are_all_maps_available()) { - dout(10) << __func__ << " osd-queuing GotReplicas" << dendl; + dout(15) << __func__ << " all repl-maps available" << dendl; m_osds->queue_scrub_got_repl_maps(m_pg, m_pg->is_scrub_blocking_ops()); } } -/** - * we are a replica being asked by the Primary to reserve OSD resources for - * scrubbing - */ void PgScrubber::handle_scrub_reserve_request(OpRequestRef op) { dout(10) << __func__ << " " << *op->get_req() << dendl; op->mark_started(); + auto request_ep = op->get_req()->get_map_epoch(); + + /* + * if we are currently holding a reservation, then: + * either (1) we, the scrubber, did not yet notice an interval change. The remembered + * reservation epoch is from before our interval, and we can silently discard the + * reservation (no message is required). + * or: + * (2) the interval hasn't changed, but the same Primary that (we think) holds the + * lock just sent us a new request. Note that we know it's the same Primary, as + * otherwise the interval would have changed. + * Ostensibly we can discard & redo the reservation. But then we + * will be temporarily releasing the OSD resource - and might not be able to grab it + * again. Thus we simple treat this as a successful new request. + */ + + if (m_remote_osd_resource.has_value() && m_remote_osd_resource->is_stale()) { + // we are holding a stale reservation from a past epoch + m_remote_osd_resource.reset(); + } - if (m_remote_osd_resource.has_value() && m_remote_osd_resource->is_reserved()) { - dout(10) << __func__ << " ignoring reserve request: Already reserved" << dendl; + if (request_ep < m_pg->get_same_interval_since()) { + // will not ack stale requests return; } bool granted{false}; + if (m_remote_osd_resource.has_value()) { - if (m_pg->cct->_conf->osd_scrub_during_recovery || !m_osds->is_recovery_active()) { + dout(10) << __func__ << " already reserved." << dendl; + granted = true; - m_remote_osd_resource.emplace(m_pg, m_osds); + } else if (m_pg->cct->_conf->osd_scrub_during_recovery || + !m_osds->is_recovery_active()) { + m_remote_osd_resource.emplace(m_pg, m_osds, request_ep); // OSD resources allocated? granted = m_remote_osd_resource->is_reserved(); if (!granted) { @@ -1321,9 +1418,8 @@ void PgScrubber::handle_scrub_reserve_request(OpRequestRef op) dout(10) << __func__ << " reserved? " << (granted ? "yes" : "no") << dendl; - auto m = op->get_req(); Message* reply = new MOSDScrubReserve( - spg_t(m_pg->info.pgid.pgid, m_pg->get_primary().shard), m->map_epoch, + spg_t(m_pg->info.pgid.pgid, m_pg->get_primary().shard), request_ep, granted ? MOSDScrubReserve::GRANT : MOSDScrubReserve::REJECT, m_pg_whoami); m_osds->send_message_osd_cluster(reply, op->get_req()->get_connection()); @@ -1337,7 +1433,8 @@ void PgScrubber::handle_scrub_reserve_grant(OpRequestRef op, pg_shard_t from) if (m_reservations.has_value()) { m_reservations->handle_reserve_grant(op, from); } else { - derr << __func__ << ": replica scrub reservations that will be leaked!" << dendl; + derr << __func__ << ": received unsolicited reservation grant from osd " << from + << " (" << op << ")" << dendl; } } @@ -1632,7 +1729,7 @@ Scrub::FsmNext PgScrubber::on_digest_updates() void PgScrubber::dump(ceph::Formatter* f) const { f->open_object_section("scrubber"); - f->dump_stream("epoch_start") << m_epoch_start; + f->dump_stream("epoch_start") << m_interval_start; f->dump_bool("active", m_active); if (m_active) { f->dump_stream("start") << m_start; @@ -1671,7 +1768,7 @@ void PgScrubber::handle_query_state(ceph::Formatter* f) dout(10) << __func__ << dendl; f->open_object_section("scrub"); - f->dump_stream("scrubber.epoch_start") << m_epoch_start; + f->dump_stream("scrubber.epoch_start") << m_interval_start; f->dump_bool("scrubber.active", m_active); f->dump_stream("scrubber.start") << m_start; f->dump_stream("scrubber.end") << m_end; @@ -1701,7 +1798,6 @@ PgScrubber::PgScrubber(PG* pg) , m_pg_id{pg->pg_id} , m_osds{m_pg->osd} , m_pg_whoami{pg->pg_whoami} - , m_epoch_queued{0} , preemption_data{pg} { dout(20) << " creating PgScrubber for " << pg->pg_id << " / " << m_pg_whoami << dendl; @@ -1715,7 +1811,6 @@ void PgScrubber::reserve_replicas() m_reservations.emplace(m_pg, m_pg_whoami); } -// called only for normal end-of-scrub, and only for a Primary void PgScrubber::cleanup_on_finish() { dout(10) << __func__ << dendl; @@ -1728,7 +1823,7 @@ void PgScrubber::cleanup_on_finish() m_reservations.reset(); m_local_osd_resource.reset(); - m_pg->requeue_ops(m_pg->waiting_for_scrub); + requeue_waiting(); reset_internal_state(); // type-specific state clear @@ -1736,31 +1831,31 @@ void PgScrubber::cleanup_on_finish() } // uses process_event(), so must be invoked externally -void PgScrubber::scrub_clear_state(bool keep_repair_state) +void PgScrubber::scrub_clear_state() { dout(10) << __func__ << dendl; - clear_pgscrub_state(keep_repair_state); + clear_pgscrub_state(); m_fsm->process_event(FullReset{}); } /* * note: does not access the state-machine */ -void PgScrubber::clear_pgscrub_state(bool keep_repair_state) +void PgScrubber::clear_pgscrub_state() { dout(10) << __func__ << dendl; ceph_assert(m_pg->is_locked()); state_clear(PG_STATE_SCRUBBING); state_clear(PG_STATE_DEEP_SCRUB); - if (!keep_repair_state) - state_clear(PG_STATE_REPAIR); + + state_clear(PG_STATE_REPAIR); clear_scrub_reservations(); m_pg->publish_stats_to_osd(); - m_pg->requeue_ops(m_pg->waiting_for_scrub); + requeue_waiting(); reset_internal_state(); @@ -1775,8 +1870,6 @@ void PgScrubber::replica_handling_done() state_clear(PG_STATE_SCRUBBING); state_clear(PG_STATE_DEEP_SCRUB); - // make sure we cleared the reservations! - preemption_data.reset(); m_maps_status.reset(); m_received_maps.clear(); @@ -1858,7 +1951,7 @@ ostream& operator<<(ostream& out, const PgScrubber& scrubber) ostream& PgScrubber::show(ostream& out) const { - return out << " [ " << m_pg_id << ": " << /*for now*/ m_flags << " ] "; + return out << " [ " << m_pg_id << ": " << m_flags << " ] "; } // ///////////////////// preemption_data_t ////////////////////////////////// @@ -1930,26 +2023,12 @@ void ReplicaReservations::send_reject() m_osds->queue_for_scrub_denied(m_pg, scrub_prio_t::low_priority); } -void ReplicaReservations::release_all() +void ReplicaReservations::discard_all() { dout(10) << __func__ << " " << m_reserved_peers << dendl; m_had_rejections = true; // preventing late-coming responses from triggering events - epoch_t epoch = m_pg->get_osdmap_epoch(); - - for (auto p : m_reserved_peers) { - release_replica(p, epoch); - } m_reserved_peers.clear(); - - // note: the release will follow on the heels of the request. When tried otherwise, - // grants that followed a reject arrived after the whole scrub machine-state was - // reset, causing leaked reservations. - if (m_pending) { - for (auto p : m_waited_for_peers) { - release_replica(p, epoch); - } - } m_waited_for_peers.clear(); } @@ -1960,7 +2039,22 @@ ReplicaReservations::~ReplicaReservations() // send un-reserve messages to all reserved replicas. We do not wait for answer (there // wouldn't be one). Other incoming messages will be discarded on the way, by our // owner. - release_all(); + dout(10) << __func__ << " " << m_reserved_peers << dendl; + + epoch_t epoch = m_pg->get_osdmap_epoch(); + + for (auto& p : m_reserved_peers) { + release_replica(p, epoch); + } + m_reserved_peers.clear(); + + // note: the release will follow on the heels of the request. When tried otherwise, + // grants that followed a reject arrived after the whole scrub machine-state was + // reset, causing leaked reservations. + for (auto& p : m_waited_for_peers) { + release_replica(p, epoch); + } + m_waited_for_peers.clear(); } /** @@ -2021,17 +2115,17 @@ void ReplicaReservations::handle_reserve_reject(OpRequestRef op, pg_shard_t from } else if (std::find(m_reserved_peers.begin(), m_reserved_peers.end(), from) != m_reserved_peers.end()) { - dout(15) << " already had osd." << from << " reserved" << dendl; + dout(10) << " already had osd." << from << " reserved" << dendl; } else { dout(10) << " osd." << from << " scrub reserve = fail" << dendl; m_had_rejections = true; // preventing any additional notifications - --m_pending; // not sure we need this bookkeeping anymore send_reject(); } } + // ///////////////////// LocalReservation ////////////////////////////////// LocalReservation::LocalReservation(PG* pg, OSDService* osds) @@ -2048,7 +2142,7 @@ LocalReservation::LocalReservation(PG* pg, OSDService* osds) m_holding_local_reservation = true; } -void LocalReservation::early_release() +LocalReservation::~LocalReservation() { if (m_holding_local_reservation) { m_holding_local_reservation = false; @@ -2057,17 +2151,11 @@ void LocalReservation::early_release() } } -LocalReservation::~LocalReservation() -{ - early_release(); -} - // ///////////////////// ReservedByRemotePrimary /////////////////////////////// -ReservedByRemotePrimary::ReservedByRemotePrimary(PG* pg, OSDService* osds) - : m_pg{pg} // holding the "whole PG" for dout() sake - , m_osds{osds} +ReservedByRemotePrimary::ReservedByRemotePrimary(PG* pg, OSDService* osds, epoch_t epoch) + : m_pg{pg}, m_osds{osds}, m_reserved_at{epoch} { if (!m_osds->inc_scrubs_remote()) { dout(10) << __func__ << ": failed to reserve at Primary request" << dendl; @@ -2079,10 +2167,13 @@ ReservedByRemotePrimary::ReservedByRemotePrimary(PG* pg, OSDService* osds) m_reserved_by_remote_primary = true; } -void ReservedByRemotePrimary::early_release() +bool ReservedByRemotePrimary::is_stale() const +{ + return m_reserved_at < m_pg->get_same_interval_since(); +} + +ReservedByRemotePrimary::~ReservedByRemotePrimary() { - dout(20) << "ReservedByRemotePrimary::" << __func__ << ": " - << m_reserved_by_remote_primary << dendl; if (m_reserved_by_remote_primary) { m_reserved_by_remote_primary = false; m_osds->dec_scrubs_remote(); @@ -2090,11 +2181,6 @@ void ReservedByRemotePrimary::early_release() } } -ReservedByRemotePrimary::~ReservedByRemotePrimary() -{ - early_release(); -} - // ///////////////////// MapsCollectionStatus //////////////////////////////// auto MapsCollectionStatus::mark_arriving_map(pg_shard_t from) @@ -2106,7 +2192,7 @@ auto MapsCollectionStatus::mark_arriving_map(pg_shard_t from) m_maps_awaited_for.erase(fe); return std::tuple{true, ""sv}; } else { - return std::tuple{false, "unsolicited scrub-map"sv}; + return std::tuple{false, " unsolicited scrub-map"sv}; } } @@ -2136,4 +2222,4 @@ ostream& operator<<(ostream& out, const MapsCollectionStatus& sf) return out << " ] "; } -} // namespace Scrub +} // namespace Scrub \ No newline at end of file diff --git a/src/osd/pg_scrubber.h b/src/osd/pg_scrubber.h index e3936691578fc..f6536dba6ef0b 100644 --- a/src/osd/pg_scrubber.h +++ b/src/osd/pg_scrubber.h @@ -53,9 +53,15 @@ class ReplicaReservations { /// notify the scrubber that we have failed to reserve replicas' resources void send_reject(); - void release_all(); - public: + /** + * quietly discard all knowledge about existing reservations. No messages + * are sent to peers. + * To be used upon interval change, as we know the the running scrub is no longer + * relevant, and that the replicas had reset the reservations on their side. + */ + void discard_all(); + ReplicaReservations(PG* pg, pg_shard_t whoami); ~ReplicaReservations(); @@ -77,7 +83,6 @@ class LocalReservation { LocalReservation(PG* pg, OSDService* osds); ~LocalReservation(); bool is_reserved() const { return m_holding_local_reservation; } - void early_release(); }; /** @@ -87,12 +92,15 @@ class ReservedByRemotePrimary { PG* m_pg; OSDService* m_osds; bool m_reserved_by_remote_primary{false}; + const epoch_t m_reserved_at; public: - ReservedByRemotePrimary(PG* pg, OSDService* osds); + ReservedByRemotePrimary(PG* pg, OSDService* osds, epoch_t epoch); ~ReservedByRemotePrimary(); [[nodiscard]] bool is_reserved() const { return m_reserved_by_remote_primary; } - void early_release(); + + /// compare the remembered reserved-at epoch to the current interval + [[nodiscard]] bool is_stale() const; }; /** @@ -183,23 +191,27 @@ class PgScrubber : public ScrubPgIF, public ScrubMachineListener { /// are we waiting for resource reservation grants form our replicas? [[nodiscard]] bool is_reserving() const final; - void send_start_scrub() final; + void send_start_scrub(epoch_t epoch_queued) final; - void send_start_after_repair() final; + void send_start_after_repair(epoch_t epoch_queued) final; - void send_scrub_resched() final; + void send_scrub_resched(epoch_t epoch_queued) final; - void active_pushes_notification() final; + void active_pushes_notification(epoch_t epoch_queued) final; void update_applied_notification(epoch_t epoch_queued) final; - void send_scrub_unblock() final; + void send_scrub_unblock(epoch_t epoch_queued) final; + + void digest_update_notification(epoch_t epoch_queued) final; - void digest_update_notification() final; + void send_replica_maps_ready(epoch_t epoch_queued) final; - void send_replica_maps_ready() final; + void send_start_replica(epoch_t epoch_queued) final; - void send_replica_pushes_upd() final; + void send_sched_replica(epoch_t epoch_queued) final; + + void send_replica_pushes_upd(epoch_t epoch_queued) final; void reset_epoch(epoch_t epoch_queued) final; @@ -213,7 +225,12 @@ class PgScrubber : public ScrubPgIF, public ScrubMachineListener { /// true if the given range intersects the scrub interval in any way bool range_intersects_scrub(const hobject_t& start, const hobject_t& end) final; + /** + * we are a replica being asked by the Primary to reserve OSD resources for + * scrubbing + */ void handle_scrub_reserve_request(OpRequestRef op) final; + void handle_scrub_reserve_grant(OpRequestRef op, pg_shard_t from) final; void handle_scrub_reserve_reject(OpRequestRef op, pg_shard_t from) final; void handle_scrub_reserve_release(OpRequestRef op) final; @@ -245,8 +262,6 @@ class PgScrubber : public ScrubPgIF, public ScrubMachineListener { // used if we are a replica void replica_scrub_op(OpRequestRef op) final; - void replica_scrub(epoch_t epoch_queued) final; - void replica_scrub_resched(epoch_t epoch_queued) final; /// the op priority, taken from the primary's request message Scrub::scrub_prio_t replica_op_priority() const final @@ -279,7 +294,7 @@ class PgScrubber : public ScrubPgIF, public ScrubMachineListener { return false; } - void scrub_clear_state(bool keep_repair_state = false) final; + void scrub_clear_state() final; /** * add to scrub statistics, but only if the soid is below the scrub start @@ -319,8 +334,6 @@ class PgScrubber : public ScrubPgIF, public ScrubMachineListener { return m_pg->recovery_state.get_last_update_applied(); } - void requeue_waiting() const final { m_pg->requeue_ops(m_pg->waiting_for_scrub); } - int pending_active_pushes() const final { return m_pg->active_pushes; } void scrub_compare_maps() final; @@ -331,7 +344,7 @@ class PgScrubber : public ScrubPgIF, public ScrubMachineListener { /// the version of 'scrub_clear_state()' that does not try to invoke FSM services /// (thus can be called from FSM reactions) - void clear_pgscrub_state(bool keep_repair_state) final; + void clear_pgscrub_state() final; void add_delayed_scheduling() final; @@ -344,10 +357,10 @@ class PgScrubber : public ScrubPgIF, public ScrubMachineListener { Scrub::FsmNext on_digest_updates() final; - void send_replica_map(bool was_preempted) final; + void send_replica_map(Scrub::PreemptionNoted was_preempted) final; - void send_remotes_reserved() final; - void send_reservation_failure() final; + void send_remotes_reserved(epoch_t epoch_queued) final; + void send_reservation_failure(epoch_t epoch_queued) final; /** * does the PG have newer updates than what we (the scrubber) know? @@ -356,11 +369,9 @@ class PgScrubber : public ScrubPgIF, public ScrubMachineListener { void set_subset_last_update(eversion_t e) final; - void replica_update_start_epoch() final; - void maps_compare_n_cleanup() final; - Scrub::preemption_t* get_preemptor() final; + Scrub::preemption_t& get_preemptor() final; int build_primary_map_chunk() final; @@ -400,32 +411,61 @@ class PgScrubber : public ScrubPgIF, public ScrubMachineListener { virtual ~PgScrubber(); // must be defined separately, in the .cc file - [[nodiscard]] bool is_scrub_active() const final; + [[nodiscard]] bool is_scrub_active() const final { return m_active; } private: void reset_internal_state(); - void _scan_snaps(ScrubMap& smap); // note that the (non-standard for a - // non-virtual) name of the function is searched - // for by the QA standalone tests. Do not modify. + void requeue_waiting() const { m_pg->requeue_ops(m_pg->waiting_for_scrub); } - void clean_meta_map(ScrubMap& for_meta_scrub); + void _scan_snaps(ScrubMap& smap); + + ScrubMap clean_meta_map(); void run_callbacks(); + void send_interval_changed(); + + // ----- methods used to verify the relevance of incoming events: + /** - * are we still a clean & healthy scrubbing primary? + * is the incoming event still relevant, and should be processed? * - * relevant only after the initial sched_scrub + * It isn't if: + * - (1) we are no longer 'actively scrubbing'; or + * - (2) the message is from an epoch prior to when we started the current scrub + * session; or + * - (3) the message epoch is from a previous interval; or + * - (4) the 'abort' configuration flags were set. + * + * For (1) & (2) - teh incoming message is discarded, w/o further action. + * + * For (3): (see check_interval() for a full description) if we have not reacted yet + * to this specific new interval, we do now: + * - replica reservations are silently discarded (we count on the replicas to notice + * the interval change and un-reserve themselves); + * - the scrubbing is halted. + * + * For (4): the message will be discarded, but also: + * if this is the first time we've noticed the 'abort' request, we perform the abort. + * + * \returns should the incoming event be processed? */ - [[nodiscard]] bool is_event_relevant(epoch_t queued) const; + bool is_message_relevant(epoch_t epoch_to_verify); /** * check the 'no scrub' configuration options. */ - [[nodiscard]] bool should_abort_scrub(epoch_t queued) const; + [[nodiscard]] bool should_abort() const; + [[nodiscard]] bool verify_against_abort(epoch_t epoch_to_verify); + + bool check_interval(epoch_t epoch_to_verify); + bool check_interval_replica(epoch_t epoch_to_verify); + + epoch_t m_last_dead_interval{}; + epoch_t m_last_aborted{}; // last time we've noticed a request to abort + - void send_epoch_changed(); /** * return true if any inconsistency/missing is repaired, false otherwise @@ -435,6 +475,8 @@ class PgScrubber : public ScrubPgIF, public ScrubMachineListener { bool m_needs_sleep{true}; ///< should we sleep before being rescheduled? always ///< 'true', unless we just got out of a sleep period + utime_t m_sleep_started_at; + // 'optional', as 'ReplicaReservations' & 'LocalReservation' are 'RAII-designed' // to guarantee un-reserving when deleted. @@ -450,8 +492,6 @@ class PgScrubber : public ScrubPgIF, public ScrubMachineListener { /// the part that actually finalizes a scrub void scrub_finish(); - utime_t m_sleep_started_at; - protected: PG* const m_pg; @@ -478,13 +518,18 @@ class PgScrubber : public ScrubPgIF, public ScrubMachineListener { OSDService* const m_osds; const pg_shard_t m_pg_whoami; ///< a local copy of m_pg->pg_whoami; - epoch_t m_epoch_start; ///< epoch when scrubbing was first scheduled - epoch_t m_epoch_queued; + epoch_t m_interval_start; ///< interval's 'from' of when scrubbing was first scheduled + /* + * the exact epoch when the scrubbing actually started (started here - cleared checks + * for no-scrub conf). Incoming events are verified against this, with stale events + * discarded. + */ + epoch_t m_epoch_start{0}; ///< the actual epoch when scrubbing started scrub_flags_t m_flags; bool m_active{false}; - eversion_t m_subset_last_update; + eversion_t m_subset_last_update{}; std::unique_ptr m_store; @@ -499,10 +544,6 @@ class PgScrubber : public ScrubPgIF, public ScrubMachineListener { CephContext* get_pg_cct() const { return m_pg->cct; } - void send_start_replica(); - - void send_sched_replica(); - // collected statistics int m_shallow_errors{0}; int m_deep_errors{0}; @@ -522,8 +563,6 @@ class PgScrubber : public ScrubPgIF, public ScrubMachineListener { */ bool m_is_deep{false}; - inline static int fake_count{2}; // unit-tests. To be removed - /** * initiate a deep-scrub after the current scrub ended with errors. */ @@ -568,11 +607,10 @@ class PgScrubber : public ScrubPgIF, public ScrubMachineListener { // ------------ members used if we are a replica - epoch_t m_replica_epoch_start; epoch_t m_replica_min_epoch; ///< the min epoch needed to handle this message - ScrubMapBuilder replica_scrubmap_pos; /// \todo document - ScrubMap replica_scrubmap; /// \todo document + ScrubMapBuilder replica_scrubmap_pos; + ScrubMap replica_scrubmap; /** * we mark the request priority as it arrived. It influences the queuing priority * when we wait for local updates diff --git a/src/osd/scrub_machine.cc b/src/osd/scrub_machine.cc index 53c4427d489f6..f9c3a0495fa99 100644 --- a/src/osd/scrub_machine.cc +++ b/src/osd/scrub_machine.cc @@ -30,13 +30,11 @@ namespace Scrub { // --------- trace/debug auxiliaries ------------------------------- -// development code. To be removed void on_event_creation(std::string_view nm) { dout(20) << " scrubberFSM event: --vvvv---- " << nm << dendl; } -// development code. To be removed void on_event_discard(std::string_view nm) { dout(20) << " scrubberFSM event: --^^^^---- " << nm << dendl; @@ -74,12 +72,12 @@ template static ostream& _prefix(std::ostream* _dout, T* t) NotActive::NotActive(my_context ctx) : my_base(ctx) { - dout(10) << " -- state -->> NotActive" << dendl; + dout(10) << "-- state -->> NotActive" << dendl; } -sc::result NotActive::react(const EpochChanged&) +sc::result NotActive::react(const IntervalChanged&) { - dout(15) << "NotActive::react(const EpochChanged&)" << dendl; + dout(15) << "NotActive::react(const IntervalChanged&)" << dendl; return discard_event(); } @@ -87,31 +85,18 @@ sc::result NotActive::react(const EpochChanged&) ReservingReplicas::ReservingReplicas(my_context ctx) : my_base(ctx) { - dout(10) << " -- state -->> ReservingReplicas" << dendl; + dout(10) << "-- state -->> ReservingReplicas" << dendl; DECLARE_LOCALS; // 'scrbr' & 'pg_id' aliases scrbr->reserve_replicas(); } -/** - * at least one replica denied us the scrub resources we've requested - */ sc::result ReservingReplicas::react(const ReservationFailure&) { DECLARE_LOCALS; // 'scrbr' & 'pg_id' aliases dout(10) << "ReservingReplicas::react(const ReservationFailure&)" << dendl; // the Scrubber must release all resources and abort the scrubbing - scrbr->clear_pgscrub_state(false); - return transit(); -} - -sc::result ReservingReplicas::react(const EpochChanged&) -{ - DECLARE_LOCALS; // 'scrbr' & 'pg_id' aliases - dout(10) << "ReservingReplicas::react(const EpochChanged&)" << dendl; - - // the Scrubber must release all resources and abort the scrubbing - scrbr->clear_pgscrub_state(false); + scrbr->clear_pgscrub_state(); return transit(); } @@ -128,7 +113,7 @@ sc::result ReservingReplicas::react(const FullReset&) ActiveScrubbing::ActiveScrubbing(my_context ctx) : my_base(ctx) { - dout(10) << " -- state -->> ActiveScrubbing" << dendl; + dout(10) << "-- state -->> ActiveScrubbing" << dendl; DECLARE_LOCALS; // 'scrbr' & 'pg_id' aliases scrbr->on_init(); } @@ -143,25 +128,23 @@ ActiveScrubbing::~ActiveScrubbing() scrbr->unreserve_replicas(); } -void ScrubMachine::down_on_epoch_change(const EpochChanged&) -{ - DECLARE_LOCALS; // 'scrbr' & 'pg_id' aliases - dout(10) << __func__ << dendl; - scrbr->unreserve_replicas(); -} - -void ScrubMachine::on_epoch_changed(const EpochChanged&) +/* + * The only source of an InternalError event as of now is the BuildMap state, + * when encountering a backend error. + * We kill the scrub and reset the FSM. + */ +sc::result ActiveScrubbing::react(const InternalError&) { DECLARE_LOCALS; // 'scrbr' & 'pg_id' aliases dout(10) << __func__ << dendl; - // the Scrubber must release all resources and abort the scrubbing - scrbr->clear_pgscrub_state(false); + scrbr->clear_pgscrub_state(); + return transit(); } sc::result ActiveScrubbing::react(const FullReset&) { dout(10) << "ActiveScrubbing::react(const FullReset&)" << dendl; - // caller takes care of this: scrbr->clear_pgscrub_state(false); + // caller takes care of clearing the scrubber & FSM states return transit(); } @@ -173,7 +156,7 @@ sc::result ActiveScrubbing::react(const FullReset&) */ RangeBlocked::RangeBlocked(my_context ctx) : my_base(ctx) { - dout(10) << " -- state -->> Act/RangeBlocked" << dendl; + dout(10) << "-- state -->> Act/RangeBlocked" << dendl; } // ----------------------- PendingTimer ----------------------------------- @@ -183,7 +166,7 @@ RangeBlocked::RangeBlocked(my_context ctx) : my_base(ctx) */ PendingTimer::PendingTimer(my_context ctx) : my_base(ctx) { - dout(10) << " -- state -->> Act/PendingTimer" << dendl; + dout(10) << "-- state -->> Act/PendingTimer" << dendl; DECLARE_LOCALS; // 'scrbr' & 'pg_id' aliases scrbr->add_delayed_scheduling(); @@ -198,10 +181,10 @@ PendingTimer::PendingTimer(my_context ctx) : my_base(ctx) */ NewChunk::NewChunk(my_context ctx) : my_base(ctx) { - dout(10) << " -- state -->> Act/NewChunk" << dendl; + dout(10) << "-- state -->> Act/NewChunk" << dendl; DECLARE_LOCALS; // 'scrbr' & 'pg_id' aliases - scrbr->get_preemptor()->adjust_parameters(); + scrbr->get_preemptor().adjust_parameters(); // choose range to work on bool got_a_chunk = scrbr->select_range(); @@ -278,15 +261,7 @@ sc::result WaitLastUpdate::react(const InternalAllUpdates&) DECLARE_LOCALS; // 'scrbr' & 'pg_id' aliases dout(10) << "WaitLastUpdate::react(const InternalAllUpdates&)" << dendl; - if (scrbr->was_epoch_changed()) { - dout(10) << "WaitLastUpdate: epoch!" << dendl; - post_event(boost::intrusive_ptr(new EpochChanged{})); - return discard_event(); - } - - dout(10) << "WaitLastUpdate::react(const InternalAllUpdates&) " - << scrbr->get_preemptor()->is_preemptable() << dendl; - scrbr->get_replicas_maps(scrbr->get_preemptor()->is_preemptable()); + scrbr->get_replicas_maps(scrbr->get_preemptor().is_preemptable()); return transit(); } @@ -296,14 +271,11 @@ BuildMap::BuildMap(my_context ctx) : my_base(ctx) { dout(10) << " -- state -->> Act/BuildMap" << dendl; DECLARE_LOCALS; // 'scrbr' & 'pg_id' aliases - dout(15) << __func__ << " same epoch? " << (scrbr->was_epoch_changed() ? "no" : "yes") - << dendl; - - if (scrbr->was_epoch_changed()) { - post_event(boost::intrusive_ptr(new EpochChanged{})); + // no need to check for an epoch change, as all possible flows that brought us here have + // an check_interval() verification of their final event. - } else if (scrbr->get_preemptor()->was_preempted()) { + if (scrbr->get_preemptor().was_preempted()) { // we were preempted, either directly or by a replica dout(10) << __func__ << " preempted!!!" << dendl; @@ -322,7 +294,7 @@ BuildMap::BuildMap(my_context ctx) : my_base(ctx) } else if (ret < 0) { dout(10) << "BuildMap::BuildMap() Error! Aborting. Ret: " << ret << dendl; - scrbr->mark_local_map_ready(); + // scrbr->mark_local_map_ready(); post_event(boost::intrusive_ptr(new InternalError{})); } else { @@ -346,7 +318,7 @@ sc::result BuildMap::react(const IntLocalMapDone&) DrainReplMaps::DrainReplMaps(my_context ctx) : my_base(ctx) { - dout(10) << " -- state -->> Act/DrainReplMaps" << dendl; + dout(10) << "-- state -->> Act/DrainReplMaps" << dendl; // we may have received all maps already. Send the event that will make us check. post_event(boost::intrusive_ptr(new GotReplicas{})); } @@ -361,7 +333,7 @@ sc::result DrainReplMaps::react(const GotReplicas&) return transit(); } - dout(10) << "DrainReplMaps::react(const GotReplicas&): still draining incoming maps: " + dout(15) << "DrainReplMaps::react(const GotReplicas&): still draining incoming maps: " << scrbr->dump_awaited_maps() << dendl; return discard_event(); } @@ -370,7 +342,7 @@ sc::result DrainReplMaps::react(const GotReplicas&) WaitReplicas::WaitReplicas(my_context ctx) : my_base(ctx) { - dout(10) << " -- state -->> Act/WaitReplicas" << dendl; + dout(10) << "-- state -->> Act/WaitReplicas" << dendl; post_event(boost::intrusive_ptr(new GotReplicas{})); } @@ -383,7 +355,7 @@ sc::result WaitReplicas::react(const GotReplicas&) dout(10) << "WaitReplicas::react(const GotReplicas&) got all" << dendl; // were we preempted? - if (scrbr->get_preemptor()->disable_and_test()) { // a test&set + if (scrbr->get_preemptor().disable_and_test()) { // a test&set dout(10) << "WaitReplicas::react(const GotReplicas&) PREEMPTED!" << dendl; @@ -391,7 +363,6 @@ sc::result WaitReplicas::react(const GotReplicas&) } else { - dout(8) << "got the replicas!" << dendl; scrbr->maps_compare_n_cleanup(); return transit(); } @@ -404,7 +375,7 @@ sc::result WaitReplicas::react(const GotReplicas&) WaitDigestUpdate::WaitDigestUpdate(my_context ctx) : my_base(ctx) { - dout(10) << " -- state -->> Act/WaitDigestUpdate" << dendl; + dout(10) << "-- state -->> Act/WaitDigestUpdate" << dendl; // perform an initial check: maybe we already // have all the updates we need: // (note that DigestUpdate is usually an external event) @@ -453,16 +424,18 @@ ScrubMachine::~ScrubMachine() ReplicaWaitUpdates::ReplicaWaitUpdates(my_context ctx) : my_base(ctx) { - dout(10) << " -- state -->> ReplicaWaitUpdates" << dendl; + dout(10) << "-- state -->> ReplicaWaitUpdates" << dendl; DECLARE_LOCALS; // 'scrbr' & 'pg_id' aliases - scrbr->on_replica_init(); - post_event(boost::intrusive_ptr(new ReplicaPushesUpd{})); } -sc::result ReplicaWaitUpdates::react(const EpochChanged&) +sc::result ReplicaWaitUpdates::react(const IntervalChanged&) { - dout(10) << "ReplicaWaitUpdates::react(const EpochChanged&)" << dendl; + DECLARE_LOCALS; // 'scrbr' & 'pg_id' aliases + dout(10) << "ReplicaWaitUpdates::react(const IntervalChanged&)" << dendl; + + // note: the master's reservation of us was just discarded by our caller + scrbr->replica_handling_done(); return transit(); } @@ -474,27 +447,32 @@ sc::result ReplicaWaitUpdates::react(const ReplicaPushesUpd&) DECLARE_LOCALS; // 'scrbr' & 'pg_id' aliases dout(10) << "ReplicaWaitUpdates::react(const ReplicaPushesUpd&): " << scrbr->pending_active_pushes() << dendl; - dout(8) << "same epoch? " << !scrbr->was_epoch_changed() << dendl; - if (scrbr->was_epoch_changed()) { - - post_event(boost::intrusive_ptr(new EpochChanged{})); - - } else if (scrbr->pending_active_pushes() == 0) { + if (scrbr->pending_active_pushes() == 0) { // done waiting - scrbr->replica_update_start_epoch(); return transit(); } return discard_event(); } +/** + * the event poster is handling the scrubber reset + */ +sc::result ReplicaWaitUpdates::react(const FullReset&) +{ + dout(10) << "ReplicaWaitUpdates::react(const FullReset&)" << dendl; + return transit(); +} + // ----------------------- ActiveReplica ----------------------------------- ActiveReplica::ActiveReplica(my_context ctx) : my_base(ctx) { - dout(10) << " -- state -->> ActiveReplica" << dendl; + DECLARE_LOCALS; // 'scrbr' & 'pg_id' aliases + dout(10) << "-- state -->> ActiveReplica" << dendl; + scrbr->on_replica_init(); // as we might have skipped ReplicaWaitUpdates post_event(boost::intrusive_ptr(new SchedReplica{})); } @@ -502,82 +480,52 @@ sc::result ActiveReplica::react(const SchedReplica&) { DECLARE_LOCALS; // 'scrbr' & 'pg_id' aliases dout(10) << "ActiveReplica::react(const SchedReplica&). is_preemptable? " - << scrbr->get_preemptor()->is_preemptable() << dendl; - - if (scrbr->was_epoch_changed()) { - - dout(10) << "epoch changed" << dendl; - post_event(boost::intrusive_ptr(new EpochChanged{})); - - } else if (scrbr->get_preemptor()->was_preempted()) { + << scrbr->get_preemptor().is_preemptable() << dendl; + if (scrbr->get_preemptor().was_preempted()) { dout(10) << "replica scrub job preempted" << dendl; - scrbr->send_replica_map(true); - post_event(boost::intrusive_ptr(new IntLocalMapDone{})); - - } else { - - // start or check progress of build_replica_map_chunk() - - auto ret = scrbr->build_replica_map_chunk(); - dout(15) << "ActiveReplica::react(const SchedReplica&) Ret: " << ret << dendl; - - if (ret == -EINPROGRESS) { - - // must wait for the backend to finish. No external event source. - // build_replica_map_chunk() has already requeued a SchedReplica - // event. - - dout(20) << "waiting for the backend..." << dendl; - - } else if (ret < 0) { - - // the existing code ignores this option, treating an error - // report as a success. - /// \todo what should we do here? + scrbr->send_replica_map(PreemptionNoted::preempted); + scrbr->replica_handling_done(); + return transit(); + } - dout(1) << "Error! Aborting. ActiveReplica::react(const " - "SchedReplica&) Ret: " - << ret << dendl; - post_event(boost::intrusive_ptr(new IntLocalMapDone{})); + // start or check progress of build_replica_map_chunk() - } else { + auto ret = scrbr->build_replica_map_chunk(); + dout(15) << "ActiveReplica::react(const SchedReplica&) Ret: " << ret << dendl; - // the local map was created. Send it to the primary. + if (ret == -EINPROGRESS) { + // must wait for the backend to finish. No external event source. + // build_replica_map_chunk() has already requeued a SchedReplica + // event. - scrbr->send_replica_map(false); // 'false' == not preempted - post_event(boost::intrusive_ptr(new IntLocalMapDone{})); - } + dout(20) << "waiting for the backend..." << dendl; + return discard_event(); } - return discard_event(); -} -sc::result ActiveReplica::react(const IntLocalMapDone&) -{ - dout(10) << "ActiveReplica::react(const IntLocalMapDone&)" << dendl; - DECLARE_LOCALS; // 'scrbr' & 'pg_id' aliases - - scrbr->replica_handling_done(); - return transit(); -} + if (ret < 0) { + // the existing code ignores this option, treating an error + // report as a success. + dout(1) << "Error! Aborting. ActiveReplica::react(SchedReplica) Ret: " << ret + << dendl; + scrbr->replica_handling_done(); + return transit(); + } -sc::result ActiveReplica::react(const InternalError&) -{ - DECLARE_LOCALS; // 'scrbr' & 'pg_id' aliases - dout(1) << "Error! Aborting." - << " ActiveReplica::react(const InternalError&) " << dendl; + // the local map was created. Send it to the primary. + scrbr->send_replica_map(PreemptionNoted::no_preemption); scrbr->replica_handling_done(); return transit(); } -sc::result ActiveReplica::react(const EpochChanged&) +sc::result ActiveReplica::react(const IntervalChanged&) { DECLARE_LOCALS; // 'scrbr' & 'pg_id' aliases - dout(10) << "ActiveReplica::react(const EpochChanged&) " << dendl; + dout(10) << "ActiveReplica::react(const IntervalChanged&) " << dendl; - scrbr->send_replica_map(false); + scrbr->send_replica_map(PreemptionNoted::no_preemption); scrbr->replica_handling_done(); return transit(); } @@ -588,7 +536,6 @@ sc::result ActiveReplica::react(const EpochChanged&) sc::result ActiveReplica::react(const FullReset&) { dout(10) << "ActiveReplica::react(const FullReset&)" << dendl; - // caller takes care of this: scrbr->clear_pgscrub_state(false); return transit(); } diff --git a/src/osd/scrub_machine.h b/src/osd/scrub_machine.h index ae1f7e812a1e6..7bbb7db37fe18 100644 --- a/src/osd/scrub_machine.h +++ b/src/osd/scrub_machine.h @@ -53,7 +53,9 @@ void on_event_discard(std::string_view nm); MEV(RemotesReserved) ///< all replicas have granted our reserve request MEV(ReservationFailure) ///< a reservation request has failed -MEV(EpochChanged) ///< ... from what it was when this chunk started + +MEV(IntervalChanged) ///< ... from what it was when this chunk started + MEV(StartScrub) ///< initiate a new scrubbing session (relevant if we are a Primary) MEV(AfterRepairScrub) ///< initiate a new scrubbing session. Only triggered at Recovery ///< completion. @@ -78,8 +80,10 @@ MEV(DigestUpdate) ///< external. called upon success of a MODIFY op. See ///< scrub_snapshot_metadata() MEV(AllChunksDone) -MEV(StartReplica) ///< initiating replica scrub. replica_scrub_op() -> OSD Q -> - ///< replica_scrub() +MEV(StartReplica) ///< initiating replica scrub. replica_scrub_op() -> OSD Q -> + ///< replica_scrub() +MEV(StartReplicaNoWait) ///< 'start replica' when there are no pending updates + MEV(SchedReplica) MEV(ReplicaPushesUpd) ///< Update to active_pushes. 'active_pushes' represents recovery ///< that is in-flight to the local ObjectStore @@ -105,17 +109,10 @@ class ScrubMachine : public sc::state_machine { PG* m_pg; // only used for dout messages spg_t m_pg_id; - ScrubMachineListener* m_scrbr; - void down_on_epoch_change(const EpochChanged&); - - void on_epoch_changed(const EpochChanged&); - void my_states() const; - void assert_not_active() const; - [[nodiscard]] bool is_reserving() const; }; @@ -127,37 +124,34 @@ class ScrubMachine : public sc::state_machine { * queued "PGScrub" op. * - a special end-of-recovery Primary scrub event ('AfterRepairScrub') that is * not required to reserve resources. - * - (for a replica) 'StartReplica', triggered by an incoming MOSDRepScrub msg. + * - (for a replica) 'StartReplica' or 'StartReplicaNoWait', triggered by an incoming + * MOSDRepScrub message. */ struct NotActive : sc::state { explicit NotActive(my_context ctx); - using reactions = mpl::list, + using reactions = mpl::list, sc::transition, // a scrubbing that was initiated at recovery completion, // and requires no resource reservations: sc::transition, sc::transition, - sc::custom_reaction>; + sc::transition>; - sc::result react(const EpochChanged&); - sc::result react(const sc::event_base&) // in the future: assert here - { - return discard_event(); - } + sc::result react(const IntervalChanged&); }; struct ReservingReplicas : sc::state { explicit ReservingReplicas(my_context ctx); - using reactions = mpl::list, + using reactions = mpl::list, // all replicas granted our resources request sc::transition, - sc::custom_reaction, sc::custom_reaction>; - sc::result react(const EpochChanged&); sc::result react(const FullReset&); + + /// at least one replica denied us the scrub resources we've requested sc::result react(const ReservationFailure&); }; @@ -184,14 +178,12 @@ struct ActiveScrubbing : sc::state // done scrubbing sc::transition, - sc::transition, + sc::custom_reaction, sc::custom_reaction>; sc::result react(const AllChunksDone&); sc::result react(const FullReset&); + sc::result react(const InternalError&); }; struct RangeBlocked : sc::state { @@ -250,13 +242,19 @@ struct WaitLastUpdate : sc::state { struct BuildMap : sc::state { explicit BuildMap(my_context ctx); + // possible error scenarios: + // - an error reported by the backend will trigger an 'InternalError' event, + // handled by our parent state; + // - if preempted, we switch to DrainReplMaps, where we will wait for all + // replicas to send their maps before acknowledging the preemption; + // - an interval change will be handled by the relevant 'send-event' functions, + // and will translated into a 'FullReset' event. using reactions = mpl::list, sc::transition, // looping, waiting // for the backend to // finish - sc::custom_reaction, - sc::transition>; // to discuss RRR + sc::custom_reaction>; sc::result react(const IntLocalMapDone&); }; @@ -301,27 +299,25 @@ struct WaitDigestUpdate : sc::state { */ struct ReplicaWaitUpdates : sc::state { explicit ReplicaWaitUpdates(my_context ctx); - using reactions = - mpl::list, sc::custom_reaction>; + using reactions = mpl::list, + sc::custom_reaction, + sc::custom_reaction>; sc::result react(const ReplicaPushesUpd&); - sc::result react(const EpochChanged&); + sc::result react(const IntervalChanged&); + sc::result react(const FullReset&); }; struct ActiveReplica : sc::state { explicit ActiveReplica(my_context ctx); - using reactions = mpl::list, + using reactions = mpl::list, sc::custom_reaction, - sc::custom_reaction, - sc::custom_reaction, - sc::custom_reaction>; + sc::custom_reaction>; sc::result react(const SchedReplica&); - sc::result react(const EpochChanged&); - sc::result react(const IntLocalMapDone&); + sc::result react(const IntervalChanged&); sc::result react(const FullReset&); - sc::result react(const InternalError&); }; } // namespace Scrub diff --git a/src/osd/scrub_machine_lstnr.h b/src/osd/scrub_machine_lstnr.h index cfaca4b10f259..2b96161215474 100644 --- a/src/osd/scrub_machine_lstnr.h +++ b/src/osd/scrub_machine_lstnr.h @@ -15,6 +15,7 @@ namespace Scrub { /// used when PgScrubber is called by the scrub-machine, to tell the FSM /// how to continue enum class FsmNext { do_discard, next_chunk, goto_notactive }; +enum class PreemptionNoted { no_preemption, preempted }; /// the interface exposed by the PgScrubber into its internal /// preemption_data object @@ -55,8 +56,6 @@ struct ScrubMachineListener { virtual eversion_t get_last_update_applied() const = 0; - virtual void requeue_waiting() const = 0; - virtual int pending_active_pushes() const = 0; virtual int build_primary_map_chunk() = 0; @@ -71,9 +70,11 @@ struct ScrubMachineListener { virtual void replica_handling_done() = 0; + // no virtual void discard_reservation_by_primary() = 0; + /// the version of 'scrub_clear_state()' that does not try to invoke FSM services /// (thus can be called from FSM reactions) - virtual void clear_pgscrub_state(bool keep_repair_state) = 0; + virtual void clear_pgscrub_state() = 0; virtual void add_delayed_scheduling() = 0; @@ -86,9 +87,7 @@ struct ScrubMachineListener { virtual Scrub::FsmNext on_digest_updates() = 0; - virtual void send_replica_map(bool was_preempted) = 0; - - virtual void replica_update_start_epoch() = 0; + virtual void send_replica_map(Scrub::PreemptionNoted was_preempted) = 0; [[nodiscard]] virtual bool has_pg_marked_new_updates() const = 0; @@ -96,7 +95,7 @@ struct ScrubMachineListener { [[nodiscard]] virtual bool was_epoch_changed() const = 0; - virtual Scrub::preemption_t* get_preemptor() = 0; + virtual Scrub::preemption_t& get_preemptor() = 0; /** * a "technical" collection of the steps performed once all diff --git a/src/osd/scrubber_common.h b/src/osd/scrubber_common.h index d736319af1be4..510c428546948 100644 --- a/src/osd/scrubber_common.h +++ b/src/osd/scrubber_common.h @@ -116,25 +116,27 @@ struct ScrubPgIF { // --------------- triggering state-machine events: - virtual void send_start_scrub() = 0; + virtual void send_start_scrub(epoch_t epoch_queued) = 0; - virtual void send_start_after_repair() = 0; + virtual void send_start_after_repair(epoch_t epoch_queued) = 0; - virtual void send_scrub_resched() = 0; + virtual void send_scrub_resched(epoch_t epoch_queued) = 0; - virtual void replica_scrub_resched(epoch_t epoch_queued) = 0; - - virtual void active_pushes_notification() = 0; + virtual void active_pushes_notification(epoch_t epoch_queued) = 0; virtual void update_applied_notification(epoch_t epoch_queued) = 0; - virtual void digest_update_notification() = 0; + virtual void digest_update_notification(epoch_t epoch_queued) = 0; + + virtual void send_scrub_unblock(epoch_t epoch_queued) = 0; + + virtual void send_replica_maps_ready(epoch_t epoch_queued) = 0; - virtual void send_scrub_unblock() = 0; + virtual void send_replica_pushes_upd(epoch_t epoch_queued) = 0; - virtual void send_replica_maps_ready() = 0; + virtual void send_start_replica(epoch_t epoch_queued) = 0; - virtual void send_replica_pushes_upd() = 0; + virtual void send_sched_replica(epoch_t epoch_queued) = 0; // -------------------------------------------------- @@ -158,24 +160,25 @@ struct ScrubPgIF { virtual void replica_scrub_op(OpRequestRef op) = 0; - virtual void replica_scrub(epoch_t epoch_queued) = 0; - virtual void set_op_parameters(requested_scrub_t&) = 0; - virtual void scrub_clear_state(bool keep_repair_state = false) = 0; + virtual void scrub_clear_state() = 0; virtual void handle_query_state(ceph::Formatter* f) = 0; virtual void dump(ceph::Formatter* f) const = 0; /** - * we allow some number of preemptions of the scrub, which mean we do - * not block. Then we start to block. Once we start blocking, we do - * not stop until the scrub range is completed. + * Return true if soid is currently being scrubbed and pending IOs should block. + * May have a side effect of preempting an in-progress scrub -- will return false + * in that case. + * + * @param soid object to check for ongoing scrub + * @return boolean whether a request on soid should block until scrub completion */ virtual bool write_blocked_by_scrub(const hobject_t& soid) = 0; - /// true if the given range intersects the scrub interval in any way + /// Returns whether any objects in the range [begin, end] are being scrubbed virtual bool range_intersects_scrub(const hobject_t& start, const hobject_t& end) = 0; /// the op priority, taken from the primary's request message @@ -201,19 +204,19 @@ struct ScrubPgIF { * the version of 'scrub_clear_state()' that does not try to invoke FSM services * (thus can be called from FSM reactions) */ - virtual void clear_pgscrub_state(bool keep_repair_state) = 0; + virtual void clear_pgscrub_state() = 0; /** * triggers the 'RemotesReserved' (all replicas granted scrub resources) * state-machine event */ - virtual void send_remotes_reserved() = 0; + virtual void send_remotes_reserved(epoch_t epoch_queued) = 0; /** * triggers the 'ReservationFailure' (at least one replica denied us the requested * resources) state-machine event */ - virtual void send_reservation_failure() = 0; + virtual void send_reservation_failure(epoch_t epoch_queued) = 0; virtual void cleanup_store(ObjectStore::Transaction* t) = 0; -- 2.39.5