From: Ronen Friedman Date: Sun, 8 Aug 2021 14:14:55 +0000 (+0000) Subject: osd/scrub: only telling the scrubber of 'updates' events if these events are waited for X-Git-Tag: v17.1.0~1112^2~1 X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=0aed4a5dd63b51288f1e01ca73d9e3632a1238be;p=ceph.git osd/scrub: only telling the scrubber of 'updates' events if these events are waited for Only the Primary, and only when there is an active scrubbing going on and the scrubber state machine is in WaitUpdates states, should be notified of 'updates'. The extra messages we queued and processed earlier, apart from creating redundant log lines and wasting CPU time, were increasing the chance of a race in the handling of stale scrubs. Signed-off-by: Ronen Friedman --- diff --git a/src/messages/MOSDRepScrub.h b/src/messages/MOSDRepScrub.h index 5b9eaae18545..ededcdca8c94 100644 --- a/src/messages/MOSDRepScrub.h +++ b/src/messages/MOSDRepScrub.h @@ -29,7 +29,7 @@ public: spg_t pgid; // PG to scrub eversion_t scrub_from; // only scrub log entries after scrub_from - eversion_t scrub_to; // last_update_applied when message sent + eversion_t scrub_to; // last_update_applied when message sent (not used) epoch_t map_epoch = 0, min_epoch = 0; bool chunky; // true for chunky scrubs hobject_t start; // lower bound of scrub, inclusive diff --git a/src/osd/PrimaryLogPG.cc b/src/osd/PrimaryLogPG.cc index 64e596b107ac..024ea91f4cf7 100644 --- a/src/osd/PrimaryLogPG.cc +++ b/src/osd/PrimaryLogPG.cc @@ -11278,8 +11278,10 @@ void PrimaryLogPG::op_applied(const eversion_t &applied_version) ceph_assert(applied_version <= info.last_update); recovery_state.local_write_applied(applied_version); - if (is_primary() && m_scrubber->should_requeue_blocked_ops(recovery_state.get_last_update_applied())) { - osd->queue_scrub_applied_update(this, is_scrub_blocking_ops()); + if (is_primary() && m_scrubber) { + // if there's a scrub operation waiting for the selected chunk to be fully updated - + // allow it to continue + m_scrubber->on_applied_when_primary(recovery_state.get_last_update_applied()); } } diff --git a/src/osd/pg_scrubber.cc b/src/osd/pg_scrubber.cc index 0a98f5addc0e..0fc96cfca245 100644 --- a/src/osd/pg_scrubber.cc +++ b/src/osd/pg_scrubber.cc @@ -558,6 +558,17 @@ void PgScrubber::set_subset_last_update(eversion_t e) dout(15) << __func__ << " last-update: " << e << dendl; } +void PgScrubber::on_applied_when_primary(const eversion_t& applied_version) +{ + // we are only interested in updates if we are the Primary, and in state + // WaitLastUpdate + if (m_fsm->is_accepting_updates() && (applied_version >= m_subset_last_update)) { + m_osds->queue_scrub_applied_update(m_pg, m_pg->is_scrub_blocking_ops()); + dout(15) << __func__ << " update: " << applied_version + << " vs. required: " << m_subset_last_update << dendl; + } +} + /* * The selected range is set directly into 'm_start' and 'm_end' * setting: @@ -1111,6 +1122,7 @@ int PgScrubber::build_scrub_map_chunk( while (!pos.done()) { int r = m_pg->get_pgbackend()->be_scan_list(map, pos); + dout(30) << __func__ << " BE returned " << r << dendl; if (r == -EINPROGRESS) { dout(20) << __func__ << " in progress" << dendl; return r; @@ -1201,7 +1213,6 @@ void PgScrubber::replica_scrub_op(OpRequestRef op) // are we still processing a previous scrub-map request without noticing that the // interval changed? won't see it here, but rather at the reservation stage. - if (msg->map_epoch < m_pg->info.history.same_interval_since) { dout(10) << "replica_scrub_op discarding old replica_scrub from " << msg->map_epoch << " < " << m_pg->info.history.same_interval_since << dendl; @@ -1465,12 +1476,13 @@ void PgScrubber::handle_scrub_reserve_request(OpRequestRef op) * otherwise the interval would have changed. * Ostensibly we can discard & redo the reservation. But then we * will be temporarily releasing the OSD resource - and might not be able to grab it - * again. Thus we simple treat this as a successful new request. + * again. Thus, we simply treat this as a successful new request. */ if (m_remote_osd_resource.has_value() && m_remote_osd_resource->is_stale()) { // we are holding a stale reservation from a past epoch m_remote_osd_resource.reset(); + dout(10) << __func__ << " stale reservation request" << dendl; } if (request_ep < m_pg->get_same_interval_since()) { diff --git a/src/osd/pg_scrubber.h b/src/osd/pg_scrubber.h index 0145ae6515f2..fdee52e6f353 100644 --- a/src/osd/pg_scrubber.h +++ b/src/osd/pg_scrubber.h @@ -214,6 +214,13 @@ class PgScrubber : public ScrubPgIF, public ScrubMachineListener { void send_sched_replica(epoch_t epoch_queued) final; void send_replica_pushes_upd(epoch_t epoch_queued) final; + /** + * The PG has updated its 'applied version'. It might be that we are waiting for this + * information: after selecting a range of objects to scrub, we've marked the latest + * version of these objects in m_subset_last_update. We will not start the map building + * before we know that the PG has reached this version. + */ + void on_applied_when_primary(const eversion_t& applied_version) final; void send_full_reset(epoch_t epoch_queued) final; @@ -345,6 +352,8 @@ class PgScrubber : public ScrubPgIF, public ScrubMachineListener { // ------------------------------------------------------------------------------------------- // the I/F used by the state-machine (i.e. the implementation of ScrubMachineListener) + [[nodiscard]] bool is_primary() const final { return m_pg->recovery_state.is_primary(); } + void select_range_n_notify() final; Scrub::BlockedRangeWarning acquire_blocked_alarm() final; @@ -417,8 +426,6 @@ class PgScrubber : public ScrubPgIF, public ScrubMachineListener { void state_set(uint64_t m) { m_pg->state_set(m); } void state_clear(uint64_t m) { m_pg->state_clear(m); } - [[nodiscard]] bool is_primary() const { return m_pg->recovery_state.is_primary(); } - [[nodiscard]] bool is_scrub_registered() const; virtual void _scrub_clear_state() {} diff --git a/src/osd/scrub_machine.cc b/src/osd/scrub_machine.cc index d34d00446d7f..edee613ffa0f 100644 --- a/src/osd/scrub_machine.cc +++ b/src/osd/scrub_machine.cc @@ -60,6 +60,14 @@ bool ScrubMachine::is_reserving() const return state_cast(); } +bool ScrubMachine::is_accepting_updates() const +{ + DECLARE_LOCALS; // 'scrbr' & 'pg_id' aliases + ceph_assert(scrbr->is_primary()); + + return state_cast(); +} + // for the rest of the code in this file - we know what PG we are dealing with: #undef dout_prefix #define dout_prefix _prefix(_dout, this->context().m_pg) @@ -242,6 +250,15 @@ WaitLastUpdate::WaitLastUpdate(my_context ctx) : my_base(ctx) post_event(UpdatesApplied{}); } +/** + * Note: + * Updates are locally readable immediately. Thus, on the replicas we do need + * to wait for the update notifications before scrubbing. For the Primary it's + * a bit different: on EC (and only there) rmw operations have an additional + * read roundtrip. That means that on the Primary we need to wait for + * last_update_applied (the replica side, even on EC, is still safe + * since the actual transaction will already be readable by commit time. + */ void WaitLastUpdate::on_new_updates(const UpdatesApplied&) { DECLARE_LOCALS; // 'scrbr' & 'pg_id' aliases diff --git a/src/osd/scrub_machine.h b/src/osd/scrub_machine.h index 4c8b0d4121bc..dd44f1061528 100644 --- a/src/osd/scrub_machine.h +++ b/src/osd/scrub_machine.h @@ -74,7 +74,7 @@ MEV(ChunkIsBusy) MEV(ActivePushesUpd) ///< Update to active_pushes. 'active_pushes' represents recovery ///< that is in-flight to the local ObjectStore -MEV(UpdatesApplied) // external +MEV(UpdatesApplied) ///< (Primary only) all updates are committed MEV(InternalAllUpdates) ///< the internal counterpart of UpdatesApplied @@ -90,8 +90,6 @@ MEV(IntLocalMapDone) MEV(DigestUpdate) ///< external. called upon success of a MODIFY op. See ///< scrub_snapshot_metadata() -MEV(AllChunksDone) - MEV(MapsCompared) ///< (Crimson) maps_compare_n_cleanup() transactions are done MEV(StartReplica) ///< initiating replica scrub. @@ -133,6 +131,7 @@ class ScrubMachine : public sc::state_machine { void my_states() const; void assert_not_active() const; [[nodiscard]] bool is_reserving() const; + [[nodiscard]] bool is_accepting_updates() const; }; /** @@ -192,13 +191,9 @@ struct ActiveScrubbing : sc::state ~ActiveScrubbing(); using reactions = mpl::list< - // done scrubbing - sc::transition, - sc::custom_reaction, sc::custom_reaction>; - sc::result react(const AllChunksDone&); sc::result react(const FullReset&); sc::result react(const InternalError&); }; diff --git a/src/osd/scrub_machine_lstnr.h b/src/osd/scrub_machine_lstnr.h index 167e5f618f51..91dee910af2c 100644 --- a/src/osd/scrub_machine_lstnr.h +++ b/src/osd/scrub_machine_lstnr.h @@ -63,6 +63,8 @@ struct ScrubMachineListener { virtual ~ScrubMachineListener() = default; + [[nodiscard]] virtual bool is_primary() const = 0; + virtual void select_range_n_notify() = 0; virtual Scrub::BlockedRangeWarning acquire_blocked_alarm() = 0; diff --git a/src/osd/scrubber_common.h b/src/osd/scrubber_common.h index 30883bf8bf38..22913875813e 100644 --- a/src/osd/scrubber_common.h +++ b/src/osd/scrubber_common.h @@ -152,6 +152,8 @@ struct ScrubPgIF { virtual void send_maps_compared(epoch_t epoch_queued) = 0; + virtual void on_applied_when_primary(const eversion_t &applied_version) = 0; + // -------------------------------------------------- [[nodiscard]] virtual bool are_callbacks_pending()