From: Bill Scales Date: Wed, 26 Mar 2025 12:40:13 +0000 (+0000) Subject: osd: EC optimizations: Get log twice when auth_log_shard is a non-primary X-Git-Tag: v20.3.0~29^2~13 X-Git-Url: http://git.apps.os.sepia.ceph.com/?a=commitdiff_plain;h=ea67439f3c5ae842f985ee492a1de9ea2aefe2de;p=ceph.git osd: EC optimizations: Get log twice when auth_log_shard is a non-primary When an event such as splitting the PG occurs the new primary does not have any log at the start of peering. Non-primary shards in an EC optimized pool may not have a complete log of writes due to partial writes. If the choosen authorative shard is a non-primary shard then the new primary needs to first get a full copy of the log (which extends past the authorative shard log) from another shard and then repeat the get log step to get the authorative shard's log so it can be merged rewinding divergent entries. Signed-off-by: Bill Scales --- diff --git a/src/osd/PeeringState.cc b/src/osd/PeeringState.cc index 09f2b45f0fe9b..483fe12bd3d84 100644 --- a/src/osd/PeeringState.cc +++ b/src/osd/PeeringState.cc @@ -1565,6 +1565,7 @@ void PeeringState::reject_reservation() map::const_iterator PeeringState::find_best_info( const map &infos, bool restrict_to_up_acting, + bool exclude_nonprimary_shards, bool *history_les_bound) const { ceph_assert(history_les_bound); @@ -1612,6 +1613,10 @@ map::const_iterator PeeringState::find_best_info( // Disqualify anyone who is incomplete (not fully backfilled) if (p->second.is_incomplete()) continue; + // If requested disqualify nonprimary shards (may have a sparse log) + if (exclude_nonprimary_shards && + pool.info.is_nonprimary_shard(shard_id_t(p->first.shard))) + continue; if (best == infos.end()) { best = p; continue; @@ -2408,6 +2413,7 @@ void PeeringState::choose_async_recovery_replicated( bool PeeringState::choose_acting(pg_shard_t &auth_log_shard_id, bool restrict_to_up_acting, bool *history_les_bound, + bool *repeat_getlog, bool request_pg_temp_change_only) { map all_info(peer_info.begin(), peer_info.end()); @@ -2421,7 +2427,28 @@ bool PeeringState::choose_acting(pg_shard_t &auth_log_shard_id, } auto auth_log_shard = find_best_info(all_info, restrict_to_up_acting, - history_les_bound); + false, history_les_bound); + + if ((repeat_getlog != nullptr) && + auth_log_shard != all_info.end() && + (info.last_update < auth_log_shard->second.last_update) && + pool.info.is_nonprimary_shard(auth_log_shard->first.shard)) { + // Only EC pools with ec_optimizations enabled: + // Our log is behind that of the auth_log_shard which is a + // non-primary shard and hence may have a sparse log, + // get a complete log from a primary shard first then + // repeat this step in the state machine to work out what + // has to be rolled backwards + psdout(10) << "auth_log_shard " << auth_log_shard->first + << " is ahead but is a non_primary shard" << dendl; + auth_log_shard = find_best_info(all_info, restrict_to_up_acting, + true, history_les_bound); + if (auth_log_shard != all_info.end()) { + psdout(10) << "auth_log_shard " << auth_log_shard->first + << " selected instead" << dendl; + *repeat_getlog = true; + } + } if (auth_log_shard == all_info.end()) { if (up != acting) { @@ -5970,7 +5997,7 @@ PeeringState::Recovering::react(const RequestBackfill &evt) pg_shard_t auth_log_shard; bool history_les_bound = false; // FIXME: Uh-oh we have to check this return value; choose_acting can fail! - ps->choose_acting(auth_log_shard, true, &history_les_bound); + ps->choose_acting(auth_log_shard, true, &history_les_bound, nullptr); } return transit(); } @@ -6044,12 +6071,12 @@ PeeringState::Recovered::Recovered(my_context ctx) // adjust acting set? (e.g. because backfill completed...) bool history_les_bound = false; - if (ps->acting != ps->up && !ps->choose_acting(auth_log_shard, - true, &history_les_bound)) { + if (ps->acting != ps->up && + !ps->choose_acting(auth_log_shard, true, &history_les_bound, nullptr)) { ceph_assert(ps->want_acting.size()); } else if (!ps->async_recovery_targets.empty()) { // FIXME: Uh-oh we have to check this return value; choose_acting can fail! - ps->choose_acting(auth_log_shard, true, &history_les_bound); + ps->choose_acting(auth_log_shard, true, &history_les_bound, nullptr); } if (context< Active >().all_replicas_activated && @@ -6197,7 +6224,7 @@ boost::statechart::result PeeringState::Active::react(const AdvMap& advmap) pg_shard_t auth_log_shard; bool history_les_bound = false; ps->remove_down_peer_info(advmap.osdmap); - ps->choose_acting(auth_log_shard, false, &history_les_bound, true); + ps->choose_acting(auth_log_shard, false, &history_les_bound, nullptr, true); } /* Check for changes in pool size (if the acting set changed as a result, @@ -6278,7 +6305,7 @@ boost::statechart::result PeeringState::Active::react(const MNotifyRec& notevt) pg_shard_t auth_log_shard; bool history_les_bound = false; // FIXME: Uh-oh we have to check this return value; choose_acting can fail! - ps->choose_acting(auth_log_shard, false, &history_les_bound, true); + ps->choose_acting(auth_log_shard, false, &history_les_bound, nullptr, true); if (!ps->want_acting.empty() && ps->want_acting != ps->acting) { psdout(10) << "Active: got notify from previous acting member " << notevt.from << ", requesting pg_temp change" @@ -7133,7 +7160,8 @@ PeeringState::GetLog::GetLog(my_context ctx) // adjust acting? if (!ps->choose_acting(auth_log_shard, false, - &context< Peering >().history_les_bound)) { + &context< Peering >().history_les_bound, + &repeat_getlog)) { if (!ps->want_acting.empty()) { post_event(NeedActingChange()); } else { @@ -7226,6 +7254,16 @@ boost::statechart::result PeeringState::GetLog::react(const GotLog&) ps->proc_master_log(context().get_cur_transaction(), msg->info, std::move(msg->log), std::move(msg->missing), auth_log_shard); + if (repeat_getlog) { + // Only EC pools with ec_optimizations enabled: + // Our log was behind that of the auth_log_shard which was a non-primary + // with a sparse log. We have just got a log from a primary shard to + // catch up and now need to recheck if we need to rollback the log to + // the auth_log_shard + psdout(10) << "repeating auth_log_shard selection" << dendl; + post_event(RepeatGetLog()); + return discard_event(); + } } ps->start_flush(context< PeeringMachine >().get_cur_transaction()); return transit< GetMissing >(); diff --git a/src/osd/PeeringState.h b/src/osd/PeeringState.h index a28c09b52765c..c47b7616a84aa 100644 --- a/src/osd/PeeringState.h +++ b/src/osd/PeeringState.h @@ -558,6 +558,7 @@ public: TrivialEvent(MakeStray) TrivialEvent(NeedActingChange) TrivialEvent(IsIncomplete) + TrivialEvent(RepeatGetLog) TrivialEvent(IsDown) TrivialEvent(AllReplicasRecovered) @@ -1312,6 +1313,7 @@ public: struct GetLog : boost::statechart::state< GetLog, Peering >, NamedState { pg_shard_t auth_log_shard; + bool repeat_getlog = false; boost::intrusive_ptr msg; explicit GetLog(my_context ctx); @@ -1324,7 +1326,8 @@ public: boost::statechart::custom_reaction< GotLog >, boost::statechart::custom_reaction< AdvMap >, boost::statechart::transition< NeedActingChange, WaitActingChange >, - boost::statechart::transition< IsIncomplete, Incomplete > + boost::statechart::transition< IsIncomplete, Incomplete >, + boost::statechart::transition< RepeatGetLog, GetLog> > reactions; boost::statechart::result react(const AdvMap&); boost::statechart::result react(const QueryState& q); @@ -1668,6 +1671,7 @@ private: std::map::const_iterator find_best_info( const std::map &infos, bool restrict_to_up_acting, + bool exclude_nonprimary_shards, bool *history_les_bound) const; static void calc_ec_acting( @@ -1740,6 +1744,7 @@ private: bool choose_acting(pg_shard_t &auth_log_shard, bool restrict_to_up_acting, bool *history_les_bound, + bool *repeat_getlog, bool request_pg_temp_change_only = false); bool search_for_missing(