From: Samuel Just Date: Sat, 30 Mar 2024 23:23:01 +0000 (-0700) Subject: osd,crimson/osd: maintain pg_committed_to on replica rather than min_last_complete_ondisk X-Git-Tag: v20.0.0~707^2~18 X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=6be4d397d027b1b824298767a4a8c1372a3aea62;p=ceph.git osd,crimson/osd: maintain pg_committed_to on replica rather than min_last_complete_ondisk This commit updates the bulk of the interface pathways in crimson and classic to refer to pg_committed_to rather than min_last_complete_ondisk and changes the replica side to maintain pg_committed_to instead. This commit shouldn't actually cause any behavior change -- we're still passing min_last_complete_ondisk (which is a valid lower bound for pg_committed_to!). Signed-off-by: Samuel Just --- diff --git a/src/crimson/osd/ops_executer.cc b/src/crimson/osd/ops_executer.cc index 9bf60140374c..7585c96897cc 100644 --- a/src/crimson/osd/ops_executer.cc +++ b/src/crimson/osd/ops_executer.cc @@ -828,7 +828,7 @@ void OpsExecuter::fill_op_params(OpsExecuter::modified_by m) osd_op_params->mtime = msg->get_mtime(); osd_op_params->at_version = pg->get_next_version(); osd_op_params->pg_trim_to = pg->get_pg_trim_to(); - osd_op_params->min_last_complete_ondisk = pg->get_min_last_complete_ondisk(); + osd_op_params->pg_committed_to = pg->get_min_last_complete_ondisk(); osd_op_params->last_complete = pg->get_info().last_complete; osd_op_params->user_modify = (m == modified_by::user); } diff --git a/src/crimson/osd/osd_operations/osdop_params.h b/src/crimson/osd/osd_operations/osdop_params.h index 102cb7fff6b6..14202582100b 100644 --- a/src/crimson/osd/osd_operations/osdop_params.h +++ b/src/crimson/osd/osd_operations/osdop_params.h @@ -12,7 +12,7 @@ struct osd_op_params_t { utime_t mtime; eversion_t at_version; eversion_t pg_trim_to; - eversion_t min_last_complete_ondisk; + eversion_t pg_committed_to; eversion_t last_complete; bool user_modify = false; ObjectCleanRegions clean_regions; diff --git a/src/crimson/osd/pg.cc b/src/crimson/osd/pg.cc index e9cf4841ff2a..86941f105168 100644 --- a/src/crimson/osd/pg.cc +++ b/src/crimson/osd/pg.cc @@ -1272,7 +1272,7 @@ PG::interruptible_future<> PG::handle_rep_op(Ref req) log_operation(std::move(log_entries), req->pg_trim_to, req->version, - req->min_last_complete_ondisk, + req->pg_committed_to, !txn.empty(), txn, false); @@ -1318,7 +1318,7 @@ void PG::log_operation( std::vector&& logv, const eversion_t &trim_to, const eversion_t &roll_forward_to, - const eversion_t &min_last_complete_ondisk, + const eversion_t &pg_committed_to, bool transaction_applied, ObjectStore::Transaction &txn, bool async) { @@ -1348,7 +1348,7 @@ void PG::log_operation( peering_state.append_log(std::move(logv), trim_to, roll_forward_to, - min_last_complete_ondisk, + pg_committed_to, txn, !txn.empty(), false); diff --git a/src/crimson/osd/pg.h b/src/crimson/osd/pg.h index 604f49005ff0..f37775bc2863 100644 --- a/src/crimson/osd/pg.h +++ b/src/crimson/osd/pg.h @@ -603,7 +603,7 @@ public: std::vector&& logv, const eversion_t &trim_to, const eversion_t &roll_forward_to, - const eversion_t &min_last_complete_ondisk, + const eversion_t &pg_commited_to, bool transaction_applied, ObjectStore::Transaction &txn, bool async = false); diff --git a/src/crimson/osd/replicated_backend.cc b/src/crimson/osd/replicated_backend.cc index cbb8c883e075..12ee38b43705 100644 --- a/src/crimson/osd/replicated_backend.cc +++ b/src/crimson/osd/replicated_backend.cc @@ -84,7 +84,7 @@ ReplicatedBackend::submit_transaction(const std::set& pg_shards, pending_txn->second.acked_peers.push_back({pg_shard, eversion_t{}}); encode(log_entries, m->logbl); m->pg_trim_to = osd_op_p.pg_trim_to; - m->min_last_complete_ondisk = osd_op_p.min_last_complete_ondisk; + m->pg_committed_to = osd_op_p.pg_committed_to; m->pg_stats = pg.get_info().stats; // TODO: set more stuff. e.g., pg_states sends->emplace_back( @@ -99,7 +99,7 @@ ReplicatedBackend::submit_transaction(const std::set& pg_shards, std::move(log_entries), osd_op_p.pg_trim_to, osd_op_p.at_version, - osd_op_p.min_last_complete_ondisk, + osd_op_p.pg_committed_to, true, txn, false); diff --git a/src/messages/MOSDRepOp.h b/src/messages/MOSDRepOp.h index cb246d8e1beb..5e8b386ba0a5 100644 --- a/src/messages/MOSDRepOp.h +++ b/src/messages/MOSDRepOp.h @@ -54,7 +54,30 @@ public: // piggybacked osd/og state eversion_t pg_trim_to; // primary->replica: trim to here - eversion_t min_last_complete_ondisk; // lower bound on committed version + + /** + * pg_committed_to + * + * Used by the primary to propagate pg_committed_to to replicas for use in + * serving replica reads. + * + * Because updates <= pg_committed_to cannot become divergent, replicas + * may safely serve reads on objects which do not have more recent updates. + * + * See PeeringState::pg_committed_to, PeeringState::can_serve_replica_read + * + * Historical note: Prior to early 2024, this field was named + * min_last_complete_ondisk. The replica, however, only actually relied on + * a single property of this field -- that any objects not modified since + * mlcod couldn't have uncommitted state. Weakening the field to the condition + * above is therefore safe -- mlcod is always <= pg_committed_to and + * sending pg_committed_to to a replica expecting mlcod will work correctly + * as it only actually uses mlcod to check replica reads. The primary difference + * between mlcod and pg_committed_to is simply that mlcod doesn't advance past + * objects missing on replicas, but we check for that anyway. This note may be + * removed in main after U is released. + */ + eversion_t pg_committed_to; hobject_t new_temp_oid; ///< new temp object that we must now start tracking hobject_t discard_temp_oid; ///< previously used temp object that we can now stop tracking @@ -111,7 +134,7 @@ public: decode(updated_hit_set_history, p); ceph_assert(header.version >= 3); - decode(min_last_complete_ondisk, p); + decode(pg_committed_to, p); final_decode_needed = false; } @@ -135,7 +158,7 @@ public: encode(discard_temp_oid, payload); encode(from, payload); encode(updated_hit_set_history, payload); - encode(min_last_complete_ondisk, payload); + encode(pg_committed_to, payload); } MOSDRepOp() @@ -170,7 +193,7 @@ public: out << " " << poid << " v " << version; if (updated_hit_set_history) out << ", has_updated_hit_set_history"; - out << ", mlcod=" << min_last_complete_ondisk; + out << ", pct=" << pg_committed_to; } out << ")"; } diff --git a/src/osd/ECCommon.h b/src/osd/ECCommon.h index edde95b6ac56..2727b76d2430 100644 --- a/src/osd/ECCommon.h +++ b/src/osd/ECCommon.h @@ -200,7 +200,7 @@ struct ECListener { const std::optional &hset_history, const eversion_t &trim_to, const eversion_t &roll_forward_to, - const eversion_t &min_last_complete_ondisk, + const eversion_t &pg_committed_to, bool transaction_applied, ceph::os::Transaction &t, bool async = false) = 0; diff --git a/src/osd/PGBackend.h b/src/osd/PGBackend.h index 9cbb5e8e97ce..362226006bab 100644 --- a/src/osd/PGBackend.h +++ b/src/osd/PGBackend.h @@ -219,7 +219,7 @@ typedef std::shared_ptr OSDMapRef; const std::optional &hset_history, const eversion_t &trim_to, const eversion_t &roll_forward_to, - const eversion_t &min_last_complete_ondisk, + const eversion_t &pg_committed_to, bool transaction_applied, ObjectStore::Transaction &t, bool async = false) = 0; @@ -435,8 +435,8 @@ typedef std::shared_ptr OSDMapRef; const eversion_t &at_version, ///< [in] version PGTransactionUPtr &&t, ///< [in] trans to execute (move) const eversion_t &trim_to, ///< [in] trim log to here - const eversion_t &min_last_complete_ondisk, ///< [in] lower bound on - /// committed version + const eversion_t &pg_committed_to, ///< [in] lower bound on + /// committed version std::vector&& log_entries, ///< [in] log entries for t /// [in] hitset history (if updated with this transaction) std::optional &hset_history, diff --git a/src/osd/PeeringState.cc b/src/osd/PeeringState.cc index 5958c62c3457..664852a5a5ef 100644 --- a/src/osd/PeeringState.cc +++ b/src/osd/PeeringState.cc @@ -1404,9 +1404,8 @@ bool PeeringState::needs_backfill() const bool PeeringState::can_serve_replica_read(const hobject_t &hoid) { ceph_assert(!is_primary()); - eversion_t min_last_complete_ondisk = get_min_last_complete_ondisk(); if (!pg_log.get_log().has_write_since( - hoid, min_last_complete_ondisk)) { + hoid, pg_committed_to)) { psdout(20) << "can be safely read on this replica" << dendl; return true; } else { @@ -4188,7 +4187,7 @@ void PeeringState::append_log( vector&& logv, eversion_t trim_to, eversion_t roll_forward_to, - eversion_t mlcod, + eversion_t pct, ObjectStore::Transaction &t, bool transaction_applied, bool async) @@ -4254,7 +4253,7 @@ void PeeringState::append_log( write_if_dirty(t); if (!is_primary()) - min_last_complete_ondisk = mlcod; + pg_committed_to = pct; } void PeeringState::recover_got( @@ -7579,7 +7578,8 @@ ostream &operator<<(ostream &out, const PeeringState &ps) { if (ps.last_complete_ondisk != ps.info.last_complete) out << " lcod " << ps.last_complete_ondisk; - out << " mlcod " << ps.min_last_complete_ondisk; + if (ps.is_primary()) + out << " mlcod " << ps.min_last_complete_ondisk; out << " " << pg_state_string(ps.get_state()); if (ps.should_send_notify()) diff --git a/src/osd/PeeringState.h b/src/osd/PeeringState.h index 0a7b30823e60..234a24c7a758 100644 --- a/src/osd/PeeringState.h +++ b/src/osd/PeeringState.h @@ -1924,7 +1924,7 @@ public: std::vector&& logv, eversion_t trim_to, eversion_t roll_forward_to, - eversion_t min_last_complete_ondisk, + eversion_t pg_committed_to, ObjectStore::Transaction &t, bool transaction_applied, bool async); diff --git a/src/osd/PrimaryLogPG.h b/src/osd/PrimaryLogPG.h index 7daa648d72c7..3ad7d41f16bb 100644 --- a/src/osd/PrimaryLogPG.h +++ b/src/osd/PrimaryLogPG.h @@ -497,7 +497,7 @@ public: const std::optional &hset_history, const eversion_t &trim_to, const eversion_t &roll_forward_to, - const eversion_t &min_last_complete_ondisk, + const eversion_t &pg_committed_to, bool transaction_applied, ObjectStore::Transaction &t, bool async = false) override { @@ -519,7 +519,7 @@ public: replica_clear_repop_obc(logv, t); } recovery_state.append_log( - std::move(logv), trim_to, roll_forward_to, min_last_complete_ondisk, + std::move(logv), trim_to, roll_forward_to, pg_committed_to, t, transaction_applied, async); } diff --git a/src/osd/ReplicatedBackend.cc b/src/osd/ReplicatedBackend.cc index a2895511dffa..beb379ca0594 100644 --- a/src/osd/ReplicatedBackend.cc +++ b/src/osd/ReplicatedBackend.cc @@ -468,7 +468,7 @@ void ReplicatedBackend::submit_transaction( const eversion_t &at_version, PGTransactionUPtr &&_t, const eversion_t &trim_to, - const eversion_t &min_last_complete_ondisk, + const eversion_t &pg_committed_to, vector&& _log_entries, std::optional &hset_history, Context *on_all_commit, @@ -517,7 +517,7 @@ void ReplicatedBackend::submit_transaction( tid, reqid, trim_to, - min_last_complete_ondisk, + pg_committed_to, added.size() ? *(added.begin()) : hobject_t(), removed.size() ? *(removed.begin()) : hobject_t(), log_entries, @@ -533,7 +533,7 @@ void ReplicatedBackend::submit_transaction( hset_history, trim_to, at_version, - min_last_complete_ondisk, + pg_committed_to, true, op_t); @@ -953,7 +953,7 @@ Message * ReplicatedBackend::generate_subop( ceph_tid_t tid, osd_reqid_t reqid, eversion_t pg_trim_to, - eversion_t min_last_complete_ondisk, + eversion_t pg_committed_to, hobject_t new_temp_oid, hobject_t discard_temp_oid, const bufferlist &log_entries, @@ -992,7 +992,7 @@ Message * ReplicatedBackend::generate_subop( // this feature is from 2019 (6f12bf27cb91), assume present ceph_assert(HAVE_FEATURE(parent->min_peer_features(), OSD_REPOP_MLCOD)); - wr->min_last_complete_ondisk = min_last_complete_ondisk; + wr->pg_committed_to = pg_committed_to; wr->new_temp_oid = new_temp_oid; wr->discard_temp_oid = discard_temp_oid; @@ -1006,7 +1006,7 @@ void ReplicatedBackend::issue_op( ceph_tid_t tid, osd_reqid_t reqid, eversion_t pg_trim_to, - eversion_t min_last_complete_ondisk, + eversion_t pg_committed_to, hobject_t new_temp_oid, hobject_t discard_temp_oid, const vector &log_entries, @@ -1039,7 +1039,7 @@ void ReplicatedBackend::issue_op( tid, reqid, pg_trim_to, - min_last_complete_ondisk, + pg_committed_to, new_temp_oid, discard_temp_oid, logs, @@ -1141,7 +1141,7 @@ void ReplicatedBackend::do_repop(OpRequestRef op) m->updated_hit_set_history, m->pg_trim_to, m->version, /* Replicated PGs don't have rollback info */ - m->min_last_complete_ondisk, + m->pg_committed_to, update_snaps, rm->localt, async); diff --git a/src/osd/ReplicatedBackend.h b/src/osd/ReplicatedBackend.h index aab75d21c737..2f3c1ea2509e 100644 --- a/src/osd/ReplicatedBackend.h +++ b/src/osd/ReplicatedBackend.h @@ -356,7 +356,7 @@ public: const eversion_t &at_version, PGTransactionUPtr &&t, const eversion_t &trim_to, - const eversion_t &min_last_complete_ondisk, + const eversion_t &pg_committed_to, std::vector&& log_entries, std::optional &hset_history, Context *on_all_commit, @@ -372,7 +372,7 @@ private: ceph_tid_t tid, osd_reqid_t reqid, eversion_t pg_trim_to, - eversion_t min_last_complete_ondisk, + eversion_t pg_committed_to, hobject_t new_temp_oid, hobject_t discard_temp_oid, const ceph::buffer::list &log_entries, @@ -386,7 +386,7 @@ private: ceph_tid_t tid, osd_reqid_t reqid, eversion_t pg_trim_to, - eversion_t min_last_complete_ondisk, + eversion_t pg_committed_to, hobject_t new_temp_oid, hobject_t discard_temp_oid, const std::vector &log_entries,