From 0ebb32213a84c7f3355046c4456b35238a730c23 Mon Sep 17 00:00:00 2001 From: Ronen Friedman Date: Tue, 5 Jul 2022 13:42:33 +0000 Subject: [PATCH] osd/scrub: pre-prepare the description text of the current state The existing code accesses the states of the scrub FSM via the standard "state iterator", which is definitely not thread-safe. It is also a bit time consuming. As it happened, new code that tried to create the state string while not holding the PG lock - crashed the OSD. This PR changes the relevant lines to allow for safe and fast access to the state description text. Signed-off-by: Ronen Friedman --- src/osd/scrubber/pg_scrubber.cc | 5 +- src/osd/scrubber/pg_scrubber.h | 7 +++ src/osd/scrubber/scrub_machine.cc | 74 ++++++++++++++++---------- src/osd/scrubber/scrub_machine.h | 41 ++++++++------ src/osd/scrubber/scrub_machine_lstnr.h | 4 ++ 5 files changed, 86 insertions(+), 45 deletions(-) diff --git a/src/osd/scrubber/pg_scrubber.cc b/src/osd/scrubber/pg_scrubber.cc index f27a9c79a9c41..1816617a53345 100644 --- a/src/osd/scrubber/pg_scrubber.cc +++ b/src/osd/scrubber/pg_scrubber.cc @@ -2359,11 +2359,10 @@ ostream &operator<<(ostream &out, const PgScrubber &scrubber) { std::ostream& PgScrubber::gen_prefix(std::ostream& out) const { - const auto fsm_state = m_fsm ? m_fsm->current_states_desc() : "- :"; if (m_pg) { - return m_pg->gen_prefix(out) << "scrubber " << fsm_state << ": "; + return m_pg->gen_prefix(out) << "scrubber<" << m_fsm_state_name << ">: "; } else { - return out << " scrubber [~] " << fsm_state << ": "; + return out << " scrubber [" << m_pg_id << "]: "; } } diff --git a/src/osd/scrubber/pg_scrubber.h b/src/osd/scrubber/pg_scrubber.h index 06f52e0bc28f6..2f28b3281fbe0 100644 --- a/src/osd/scrubber/pg_scrubber.h +++ b/src/osd/scrubber/pg_scrubber.h @@ -456,6 +456,11 @@ class PgScrubber : public ScrubPgIF, return m_pg->recovery_state.is_primary(); } + void set_state_name(const char* name) final + { + m_fsm_state_name = name; + } + void select_range_n_notify() final; Scrub::BlockedRangeWarning acquire_blocked_alarm() final; @@ -714,6 +719,8 @@ class PgScrubber : public ScrubPgIF, bool deep); std::unique_ptr m_fsm; + /// the FSM state, as a string for logging + const char* m_fsm_state_name{nullptr}; const spg_t m_pg_id; ///< a local copy of m_pg->pg_id OSDService* const m_osds; const pg_shard_t m_pg_whoami; ///< a local copy of m_pg->pg_whoami; diff --git a/src/osd/scrubber/scrub_machine.cc b/src/osd/scrubber/scrub_machine.cc index 7cc789571aa2b..d63c50fafc676 100644 --- a/src/osd/scrubber/scrub_machine.cc +++ b/src/osd/scrubber/scrub_machine.cc @@ -26,6 +26,11 @@ using namespace std::chrono_literals; auto pg_id = context().m_pg_id; \ std::ignore = pg_id; +NamedSimply::NamedSimply(ScrubMachineListener* scrubber, const char* name) +{ + scrubber->set_state_name(name); +} + namespace Scrub { // --------- trace/debug auxiliaries ------------------------------- @@ -40,19 +45,6 @@ void on_event_discard(std::string_view nm) dout(20) << " event: --^^^^---- " << nm << dendl; } -std::string ScrubMachine::current_states_desc() const -{ - std::string sts{"<"}; - for (auto si = state_begin(); si != state_end(); ++si) { - const auto& siw{*si}; // prevents a warning re side-effects - // the '7' is the size of the 'scrub::' - sts += - boost::core::demangle(typeid(siw).name()).substr(7, std::string::npos) + - "/"; - } - return sts + ">"; -} - void ScrubMachine::assert_not_active() const { ceph_assert(state_cast()); @@ -90,7 +82,9 @@ std::ostream& ScrubMachine::gen_prefix(std::ostream& out) const // ----------------------- NotActive ----------------------------------------- -NotActive::NotActive(my_context ctx) : my_base(ctx) +NotActive::NotActive(my_context ctx) + : my_base(ctx) + , NamedSimply(context().m_scrbr, "NotActive") { dout(10) << "-- state -->> NotActive" << dendl; DECLARE_LOCALS; // 'scrbr' & 'pg_id' aliases @@ -115,7 +109,9 @@ sc::result NotActive::react(const AfterRepairScrub&) // ----------------------- ReservingReplicas --------------------------------- -ReservingReplicas::ReservingReplicas(my_context ctx) : my_base(ctx) +ReservingReplicas::ReservingReplicas(my_context ctx) + : my_base(ctx) + , NamedSimply(context().m_scrbr, "ReservingReplicas") { dout(10) << "-- state -->> ReservingReplicas" << dendl; DECLARE_LOCALS; // 'scrbr' & 'pg_id' aliases @@ -153,7 +149,9 @@ sc::result ReservingReplicas::react(const FullReset&) // ----------------------- ActiveScrubbing ----------------------------------- -ActiveScrubbing::ActiveScrubbing(my_context ctx) : my_base(ctx) +ActiveScrubbing::ActiveScrubbing(my_context ctx) + : my_base(ctx) + , NamedSimply(context().m_scrbr, "ActiveScrubbing") { dout(10) << "-- state -->> ActiveScrubbing" << dendl; DECLARE_LOCALS; // 'scrbr' & 'pg_id' aliases @@ -203,7 +201,9 @@ sc::result ActiveScrubbing::react(const FullReset&) * If that happens, all we can do is to issue a warning message to help * with the debugging. */ -RangeBlocked::RangeBlocked(my_context ctx) : my_base(ctx) +RangeBlocked::RangeBlocked(my_context ctx) + : my_base(ctx) + , NamedSimply(context().m_scrbr, "Act/RangeBlocked") { dout(10) << "-- state -->> Act/RangeBlocked" << dendl; DECLARE_LOCALS; // 'scrbr' & 'pg_id' aliases @@ -218,7 +218,9 @@ RangeBlocked::RangeBlocked(my_context ctx) : my_base(ctx) /** * Sleeping till timer reactivation - or just requeuing */ -PendingTimer::PendingTimer(my_context ctx) : my_base(ctx) +PendingTimer::PendingTimer(my_context ctx) + : my_base(ctx) + , NamedSimply(context().m_scrbr, "Act/PendingTimer") { dout(10) << "-- state -->> Act/PendingTimer" << dendl; DECLARE_LOCALS; // 'scrbr' & 'pg_id' aliases @@ -233,7 +235,9 @@ PendingTimer::PendingTimer(my_context ctx) : my_base(ctx) * - preemption data was set * - epoch start was updated */ -NewChunk::NewChunk(my_context ctx) : my_base(ctx) +NewChunk::NewChunk(my_context ctx) + : my_base(ctx) + , NamedSimply(context().m_scrbr, "Act/NewChunk") { dout(10) << "-- state -->> Act/NewChunk" << dendl; DECLARE_LOCALS; // 'scrbr' & 'pg_id' aliases @@ -258,7 +262,9 @@ sc::result NewChunk::react(const SelectedChunkFree&) // ----------------------- WaitPushes ----------------------------------- -WaitPushes::WaitPushes(my_context ctx) : my_base(ctx) +WaitPushes::WaitPushes(my_context ctx) + : my_base(ctx) + , NamedSimply(context().m_scrbr, "Act/WaitPushes") { dout(10) << " -- state -->> Act/WaitPushes" << dendl; post_event(ActivePushesUpd{}); @@ -284,7 +290,9 @@ sc::result WaitPushes::react(const ActivePushesUpd&) // ----------------------- WaitLastUpdate ----------------------------------- -WaitLastUpdate::WaitLastUpdate(my_context ctx) : my_base(ctx) +WaitLastUpdate::WaitLastUpdate(my_context ctx) + : my_base(ctx) + , NamedSimply(context().m_scrbr, "Act/WaitLastUpdate") { dout(10) << " -- state -->> Act/WaitLastUpdate" << dendl; post_event(UpdatesApplied{}); @@ -326,7 +334,9 @@ sc::result WaitLastUpdate::react(const InternalAllUpdates&) // ----------------------- BuildMap ----------------------------------- -BuildMap::BuildMap(my_context ctx) : my_base(ctx) +BuildMap::BuildMap(my_context ctx) + : my_base(ctx) + , NamedSimply(context().m_scrbr, "Act/BuildMap") { dout(10) << " -- state -->> Act/BuildMap" << dendl; DECLARE_LOCALS; // 'scrbr' & 'pg_id' aliases @@ -374,7 +384,9 @@ sc::result BuildMap::react(const IntLocalMapDone&) // ----------------------- DrainReplMaps ----------------------------------- -DrainReplMaps::DrainReplMaps(my_context ctx) : my_base(ctx) +DrainReplMaps::DrainReplMaps(my_context ctx) + : my_base(ctx) + , NamedSimply(context().m_scrbr, "Act/DrainReplMaps") { dout(10) << "-- state -->> Act/DrainReplMaps" << dendl; // we may have got all maps already. Send the event that will make us check. @@ -399,7 +411,9 @@ sc::result DrainReplMaps::react(const GotReplicas&) // ----------------------- WaitReplicas ----------------------------------- -WaitReplicas::WaitReplicas(my_context ctx) : my_base(ctx) +WaitReplicas::WaitReplicas(my_context ctx) + : my_base(ctx) + , NamedSimply(context().m_scrbr, "Act/WaitReplicas") { dout(10) << "-- state -->> Act/WaitReplicas" << dendl; post_event(GotReplicas{}); @@ -460,7 +474,9 @@ sc::result WaitReplicas::react(const DigestUpdate&) // ----------------------- WaitDigestUpdate ----------------------------------- -WaitDigestUpdate::WaitDigestUpdate(my_context ctx) : my_base(ctx) +WaitDigestUpdate::WaitDigestUpdate(my_context ctx) + : my_base(ctx) + , NamedSimply(context().m_scrbr, "Act/WaitDigestUpdate") { DECLARE_LOCALS; // 'scrbr' & 'pg_id' aliases dout(10) << "-- state -->> Act/WaitDigestUpdate" << dendl; @@ -505,7 +521,9 @@ ScrubMachine::~ScrubMachine() = default; // ----------------------- ReplicaWaitUpdates -------------------------------- -ReplicaWaitUpdates::ReplicaWaitUpdates(my_context ctx) : my_base(ctx) +ReplicaWaitUpdates::ReplicaWaitUpdates(my_context ctx) + : my_base(ctx) + , NamedSimply(context().m_scrbr, "ReplicaWaitUpdates") { dout(10) << "-- state -->> ReplicaWaitUpdates" << dendl; DECLARE_LOCALS; // 'scrbr' & 'pg_id' aliases @@ -541,7 +559,9 @@ sc::result ReplicaWaitUpdates::react(const FullReset&) // ----------------------- ActiveReplica ----------------------------------- -ActiveReplica::ActiveReplica(my_context ctx) : my_base(ctx) +ActiveReplica::ActiveReplica(my_context ctx) + : my_base(ctx) + , NamedSimply(context().m_scrbr, "ActiveReplica") { DECLARE_LOCALS; // 'scrbr' & 'pg_id' aliases dout(10) << "-- state -->> ActiveReplica" << dendl; diff --git a/src/osd/scrubber/scrub_machine.h b/src/osd/scrubber/scrub_machine.h index fdd6250904239..6236fa5d23f68 100644 --- a/src/osd/scrubber/scrub_machine.h +++ b/src/osd/scrubber/scrub_machine.h @@ -20,8 +20,16 @@ #include "scrub_machine_lstnr.h" +/// a wrapper that sets the FSM state description used by the +/// PgScrubber +/// \todo consider using the full NamedState as in Peering +struct NamedSimply { + explicit NamedSimply(ScrubMachineListener* scrubber, const char* name); +}; + class PG; // holding a pointer to that one - just for testing class PgScrubber; + namespace Scrub { namespace sc = ::boost::statechart; @@ -145,7 +153,6 @@ class ScrubMachine : public sc::state_machine { ScrubMachineListener* m_scrbr; std::ostream& gen_prefix(std::ostream& out) const; - std::string current_states_desc() const; void assert_not_active() const; [[nodiscard]] bool is_reserving() const; [[nodiscard]] bool is_accepting_updates() const; @@ -169,7 +176,7 @@ class ScrubMachine : public sc::state_machine { * using the resource-request to identify and tag the scrub session, this * bypass cannot be supported anymore. */ -struct NotActive : sc::state { +struct NotActive : sc::state, NamedSimply { explicit NotActive(my_context ctx); using reactions = @@ -182,7 +189,8 @@ struct NotActive : sc::state { sc::result react(const AfterRepairScrub&); }; -struct ReservingReplicas : sc::state { +struct ReservingReplicas : sc::state, + NamedSimply { explicit ReservingReplicas(my_context ctx); ~ReservingReplicas(); @@ -222,7 +230,7 @@ struct WaitReplicas; struct WaitDigestUpdate; struct ActiveScrubbing - : sc::state { + : sc::state, NamedSimply { explicit ActiveScrubbing(my_context ctx); ~ActiveScrubbing(); @@ -234,21 +242,21 @@ struct ActiveScrubbing sc::result react(const InternalError&); }; -struct RangeBlocked : sc::state { +struct RangeBlocked : sc::state, NamedSimply { explicit RangeBlocked(my_context ctx); using reactions = mpl::list>; Scrub::BlockedRangeWarning m_timeout; }; -struct PendingTimer : sc::state { +struct PendingTimer : sc::state, NamedSimply { explicit PendingTimer(my_context ctx); using reactions = mpl::list>; }; -struct NewChunk : sc::state { +struct NewChunk : sc::state, NamedSimply { explicit NewChunk(my_context ctx); @@ -267,7 +275,7 @@ struct NewChunk : sc::state { * (in-flight data to the Objectstore is not readable until written to * disk, termed 'applied' here) */ -struct WaitPushes : sc::state { +struct WaitPushes : sc::state, NamedSimply { explicit WaitPushes(my_context ctx); @@ -276,7 +284,8 @@ struct WaitPushes : sc::state { sc::result react(const ActivePushesUpd&); }; -struct WaitLastUpdate : sc::state { +struct WaitLastUpdate : sc::state, + NamedSimply { explicit WaitLastUpdate(my_context ctx); @@ -291,7 +300,7 @@ struct WaitLastUpdate : sc::state { sc::result react(const InternalAllUpdates&); }; -struct BuildMap : sc::state { +struct BuildMap : sc::state, NamedSimply { explicit BuildMap(my_context ctx); // possible error scenarios: @@ -312,7 +321,7 @@ struct BuildMap : sc::state { /* * "drain" scrub-maps responses from replicas */ -struct DrainReplMaps : sc::state { +struct DrainReplMaps : sc::state, NamedSimply { explicit DrainReplMaps(my_context ctx); using reactions = @@ -322,7 +331,7 @@ struct DrainReplMaps : sc::state { sc::result react(const GotReplicas&); }; -struct WaitReplicas : sc::state { +struct WaitReplicas : sc::state, NamedSimply { explicit WaitReplicas(my_context ctx); using reactions = mpl::list< @@ -336,7 +345,8 @@ struct WaitReplicas : sc::state { bool all_maps_already_called{false}; // see comment in react code }; -struct WaitDigestUpdate : sc::state { +struct WaitDigestUpdate : sc::state, + NamedSimply { explicit WaitDigestUpdate(my_context ctx); using reactions = mpl::list, @@ -355,7 +365,8 @@ struct WaitDigestUpdate : sc::state { * - the details of the Primary's request were internalized by PgScrubber; * - 'active' scrubbing is set */ -struct ReplicaWaitUpdates : sc::state { +struct ReplicaWaitUpdates : sc::state, + NamedSimply { explicit ReplicaWaitUpdates(my_context ctx); using reactions = mpl::list, sc::custom_reaction>; @@ -365,7 +376,7 @@ struct ReplicaWaitUpdates : sc::state { }; -struct ActiveReplica : sc::state { +struct ActiveReplica : sc::state, NamedSimply { explicit ActiveReplica(my_context ctx); using reactions = mpl::list, sc::custom_reaction, diff --git a/src/osd/scrubber/scrub_machine_lstnr.h b/src/osd/scrubber/scrub_machine_lstnr.h index 9135c3257af41..a8e77d075eb15 100644 --- a/src/osd/scrubber/scrub_machine_lstnr.h +++ b/src/osd/scrubber/scrub_machine_lstnr.h @@ -80,6 +80,10 @@ struct ScrubMachineListener { virtual ~ScrubMachineListener() = default; + /// set the string we'd use in logs to convey the current state-machine + /// state. + virtual void set_state_name(const char* name) = 0; + [[nodiscard]] virtual bool is_primary() const = 0; virtual void select_range_n_notify() = 0; -- 2.39.5