]> git-server-git.apps.pok.os.sepia.ceph.com Git - ceph.git/commitdiff
osd,crimson/osd: maintain pg_committed_to on replica rather than min_last_complete_ondisk
authorSamuel Just <sjust@redhat.com>
Sat, 30 Mar 2024 23:23:01 +0000 (16:23 -0700)
committerSamuel Just <sjust@redhat.com>
Fri, 18 Oct 2024 20:33:57 +0000 (20:33 +0000)
This commit updates the bulk of the interface pathways in crimson and
classic to refer to pg_committed_to rather than min_last_complete_ondisk
and changes the replica side to maintain pg_committed_to instead.

This commit shouldn't actually cause any behavior change -- we're still
passing min_last_complete_ondisk (which is a valid lower bound for
pg_committed_to!).

Signed-off-by: Samuel Just <sjust@redhat.com>
13 files changed:
src/crimson/osd/ops_executer.cc
src/crimson/osd/osd_operations/osdop_params.h
src/crimson/osd/pg.cc
src/crimson/osd/pg.h
src/crimson/osd/replicated_backend.cc
src/messages/MOSDRepOp.h
src/osd/ECCommon.h
src/osd/PGBackend.h
src/osd/PeeringState.cc
src/osd/PeeringState.h
src/osd/PrimaryLogPG.h
src/osd/ReplicatedBackend.cc
src/osd/ReplicatedBackend.h

index 9bf60140374c8469c06fdd0a7a22d945f4494901..7585c96897cc02c3dcb5a2c5412b1acf1690fd73 100644 (file)
@@ -828,7 +828,7 @@ void OpsExecuter::fill_op_params(OpsExecuter::modified_by m)
   osd_op_params->mtime = msg->get_mtime();
   osd_op_params->at_version = pg->get_next_version();
   osd_op_params->pg_trim_to = pg->get_pg_trim_to();
-  osd_op_params->min_last_complete_ondisk = pg->get_min_last_complete_ondisk();
+  osd_op_params->pg_committed_to = pg->get_min_last_complete_ondisk();
   osd_op_params->last_complete = pg->get_info().last_complete;
   osd_op_params->user_modify = (m == modified_by::user);
 }
index 102cb7fff6b61d2694c379de107b3d67e6eca724..14202582100b3d0095213faf932e66a29b32421e 100644 (file)
@@ -12,7 +12,7 @@ struct osd_op_params_t {
   utime_t mtime;
   eversion_t at_version;
   eversion_t pg_trim_to;
-  eversion_t min_last_complete_ondisk;
+  eversion_t pg_committed_to;
   eversion_t last_complete;
   bool user_modify = false;
   ObjectCleanRegions clean_regions;
index e9cf4841ff2a1b4a73d3578269ab23e3bf01325d..86941f10516884a2a97d97fdf1a6e584f54df54c 100644 (file)
@@ -1272,7 +1272,7 @@ PG::interruptible_future<> PG::handle_rep_op(Ref<MOSDRepOp> req)
   log_operation(std::move(log_entries),
                 req->pg_trim_to,
                 req->version,
-                req->min_last_complete_ondisk,
+                req->pg_committed_to,
                 !txn.empty(),
                 txn,
                 false);
@@ -1318,7 +1318,7 @@ void PG::log_operation(
   std::vector<pg_log_entry_t>&& logv,
   const eversion_t &trim_to,
   const eversion_t &roll_forward_to,
-  const eversion_t &min_last_complete_ondisk,
+  const eversion_t &pg_committed_to,
   bool transaction_applied,
   ObjectStore::Transaction &txn,
   bool async) {
@@ -1348,7 +1348,7 @@ void PG::log_operation(
   peering_state.append_log(std::move(logv),
                            trim_to,
                            roll_forward_to,
-                           min_last_complete_ondisk,
+                           pg_committed_to,
                            txn,
                            !txn.empty(),
                            false);
index 604f49005ff040a82d1abe8b9a70b178eab0a5b2..f37775bc2863142477d750b6faf9391416eb63c8 100644 (file)
@@ -603,7 +603,7 @@ public:
     std::vector<pg_log_entry_t>&& logv,
     const eversion_t &trim_to,
     const eversion_t &roll_forward_to,
-    const eversion_t &min_last_complete_ondisk,
+    const eversion_t &pg_commited_to,
     bool transaction_applied,
     ObjectStore::Transaction &txn,
     bool async = false);
index cbb8c883e075273345278d9400608edf12601bb5..12ee38b4370548c9a403ad314df5ca7db9f1d4ed 100644 (file)
@@ -84,7 +84,7 @@ ReplicatedBackend::submit_transaction(const std::set<pg_shard_t>& pg_shards,
       pending_txn->second.acked_peers.push_back({pg_shard, eversion_t{}});
       encode(log_entries, m->logbl);
       m->pg_trim_to = osd_op_p.pg_trim_to;
-      m->min_last_complete_ondisk = osd_op_p.min_last_complete_ondisk;
+      m->pg_committed_to = osd_op_p.pg_committed_to;
       m->pg_stats = pg.get_info().stats;
       // TODO: set more stuff. e.g., pg_states
       sends->emplace_back(
@@ -99,7 +99,7 @@ ReplicatedBackend::submit_transaction(const std::set<pg_shard_t>& pg_shards,
     std::move(log_entries),
     osd_op_p.pg_trim_to,
     osd_op_p.at_version,
-    osd_op_p.min_last_complete_ondisk,
+    osd_op_p.pg_committed_to,
     true,
     txn,
     false);
index cb246d8e1beb06c05784f02701bbf749529a2eb4..5e8b386ba0a519489e7fe88278c447972c849808 100644 (file)
@@ -54,7 +54,30 @@ public:
 
   // piggybacked osd/og state
   eversion_t pg_trim_to;   // primary->replica: trim to here
-  eversion_t min_last_complete_ondisk; // lower bound on committed version
+
+  /**
+   * pg_committed_to
+   *
+   * Used by the primary to propagate pg_committed_to to replicas for use in
+   * serving replica reads.
+   *
+   * Because updates <= pg_committed_to cannot become divergent, replicas
+   * may safely serve reads on objects which do not have more recent updates.
+   *
+   * See PeeringState::pg_committed_to, PeeringState::can_serve_replica_read
+   *
+   * Historical note: Prior to early 2024, this field was named
+   * min_last_complete_ondisk.  The replica, however, only actually relied on
+   * a single property of this field -- that any objects not modified since
+   * mlcod couldn't have uncommitted state.  Weakening the field to the condition
+   * above is therefore safe -- mlcod is always <= pg_committed_to and
+   * sending pg_committed_to to a replica expecting mlcod will work correctly
+   * as it only actually uses mlcod to check replica reads. The primary difference
+   * between mlcod and pg_committed_to is simply that mlcod doesn't advance past
+   * objects missing on replicas, but we check for that anyway.  This note may be
+   * removed in main after U is released.
+   */
+  eversion_t pg_committed_to;
 
   hobject_t new_temp_oid;      ///< new temp object that we must now start tracking
   hobject_t discard_temp_oid;  ///< previously used temp object that we can now stop tracking
@@ -111,7 +134,7 @@ public:
     decode(updated_hit_set_history, p);
 
     ceph_assert(header.version >= 3);
-    decode(min_last_complete_ondisk, p);
+    decode(pg_committed_to, p);
     final_decode_needed = false;
   }
 
@@ -135,7 +158,7 @@ public:
     encode(discard_temp_oid, payload);
     encode(from, payload);
     encode(updated_hit_set_history, payload);
-    encode(min_last_complete_ondisk, payload);
+    encode(pg_committed_to, payload);
   }
 
   MOSDRepOp()
@@ -170,7 +193,7 @@ public:
       out << " " << poid << " v " << version;
       if (updated_hit_set_history)
         out << ", has_updated_hit_set_history";
-      out << ", mlcod=" << min_last_complete_ondisk;
+      out << ", pct=" << pg_committed_to;
     }
     out << ")";
   }
index edde95b6ac563c99e855529f0434f7736bd2d1a0..2727b76d24302bf2ae9d217c3730e58546eb2dd0 100644 (file)
@@ -200,7 +200,7 @@ struct ECListener {
     const std::optional<pg_hit_set_history_t> &hset_history,
     const eversion_t &trim_to,
     const eversion_t &roll_forward_to,
-    const eversion_t &min_last_complete_ondisk,
+    const eversion_t &pg_committed_to,
     bool transaction_applied,
     ceph::os::Transaction &t,
     bool async = false) = 0;
index 9cbb5e8e97ce8f309029f1bcc3d666ef9a4017e9..362226006babc2adbdc2935610e2d043ba3e1822 100644 (file)
@@ -219,7 +219,7 @@ typedef std::shared_ptr<const OSDMap> OSDMapRef;
        const std::optional<pg_hit_set_history_t> &hset_history,
        const eversion_t &trim_to,
        const eversion_t &roll_forward_to,
-       const eversion_t &min_last_complete_ondisk,
+       const eversion_t &pg_committed_to,
        bool transaction_applied,
        ObjectStore::Transaction &t,
        bool async = false) = 0;
@@ -435,8 +435,8 @@ typedef std::shared_ptr<const OSDMap> OSDMapRef;
      const eversion_t &at_version,        ///< [in] version
      PGTransactionUPtr &&t,               ///< [in] trans to execute (move)
      const eversion_t &trim_to,           ///< [in] trim log to here
-     const eversion_t &min_last_complete_ondisk, ///< [in] lower bound on
-                                                 ///  committed version
+     const eversion_t &pg_committed_to,   ///< [in] lower bound on
+                                          ///       committed version
      std::vector<pg_log_entry_t>&& log_entries, ///< [in] log entries for t
      /// [in] hitset history (if updated with this transaction)
      std::optional<pg_hit_set_history_t> &hset_history,
index 5958c62c3457aba1fcd48730eab61b64f6ac8790..664852a5a5ef6ba4fbe64cc762202344ad754b2d 100644 (file)
@@ -1404,9 +1404,8 @@ bool PeeringState::needs_backfill() const
 bool PeeringState::can_serve_replica_read(const hobject_t &hoid)
 {
   ceph_assert(!is_primary());
-  eversion_t min_last_complete_ondisk = get_min_last_complete_ondisk();
   if (!pg_log.get_log().has_write_since(
-      hoid, min_last_complete_ondisk)) {
+      hoid, pg_committed_to)) {
     psdout(20) << "can be safely read on this replica" << dendl;
     return true;
   } else {
@@ -4188,7 +4187,7 @@ void PeeringState::append_log(
   vector<pg_log_entry_t>&& logv,
   eversion_t trim_to,
   eversion_t roll_forward_to,
-  eversion_t mlcod,
+  eversion_t pct,
   ObjectStore::Transaction &t,
   bool transaction_applied,
   bool async)
@@ -4254,7 +4253,7 @@ void PeeringState::append_log(
   write_if_dirty(t);
 
   if (!is_primary())
-    min_last_complete_ondisk = mlcod;
+    pg_committed_to = pct;
 }
 
 void PeeringState::recover_got(
@@ -7579,7 +7578,8 @@ ostream &operator<<(ostream &out, const PeeringState &ps) {
   if (ps.last_complete_ondisk != ps.info.last_complete)
     out << " lcod " << ps.last_complete_ondisk;
 
-  out << " mlcod " << ps.min_last_complete_ondisk;
+  if (ps.is_primary())
+    out << " mlcod " << ps.min_last_complete_ondisk;
 
   out << " " << pg_state_string(ps.get_state());
   if (ps.should_send_notify())
index 0a7b30823e60f3b1b045b8e38fc3e3c2f761b466..234a24c7a75832a0c87797d786d61a809988b56b 100644 (file)
@@ -1924,7 +1924,7 @@ public:
     std::vector<pg_log_entry_t>&& logv,
     eversion_t trim_to,
     eversion_t roll_forward_to,
-    eversion_t min_last_complete_ondisk,
+    eversion_t pg_committed_to,
     ObjectStore::Transaction &t,
     bool transaction_applied,
     bool async);
index 7daa648d72c7ea91fbe44563791e682462f15b29..3ad7d41f16bb40005dcfc03333ef31265e0ffde4 100644 (file)
@@ -497,7 +497,7 @@ public:
     const std::optional<pg_hit_set_history_t> &hset_history,
     const eversion_t &trim_to,
     const eversion_t &roll_forward_to,
-    const eversion_t &min_last_complete_ondisk,
+    const eversion_t &pg_committed_to,
     bool transaction_applied,
     ObjectStore::Transaction &t,
     bool async = false) override {
@@ -519,7 +519,7 @@ public:
       replica_clear_repop_obc(logv, t);
     }
     recovery_state.append_log(
-      std::move(logv), trim_to, roll_forward_to, min_last_complete_ondisk,
+      std::move(logv), trim_to, roll_forward_to, pg_committed_to,
       t, transaction_applied, async);
   }
 
index a2895511dffaba16d366393f67e42d1e61039454..beb379ca0594602987e6b31552359257cb36e6e2 100644 (file)
@@ -468,7 +468,7 @@ void ReplicatedBackend::submit_transaction(
   const eversion_t &at_version,
   PGTransactionUPtr &&_t,
   const eversion_t &trim_to,
-  const eversion_t &min_last_complete_ondisk,
+  const eversion_t &pg_committed_to,
   vector<pg_log_entry_t>&& _log_entries,
   std::optional<pg_hit_set_history_t> &hset_history,
   Context *on_all_commit,
@@ -517,7 +517,7 @@ void ReplicatedBackend::submit_transaction(
     tid,
     reqid,
     trim_to,
-    min_last_complete_ondisk,
+    pg_committed_to,
     added.size() ? *(added.begin()) : hobject_t(),
     removed.size() ? *(removed.begin()) : hobject_t(),
     log_entries,
@@ -533,7 +533,7 @@ void ReplicatedBackend::submit_transaction(
     hset_history,
     trim_to,
     at_version,
-    min_last_complete_ondisk,
+    pg_committed_to,
     true,
     op_t);
   
@@ -953,7 +953,7 @@ Message * ReplicatedBackend::generate_subop(
   ceph_tid_t tid,
   osd_reqid_t reqid,
   eversion_t pg_trim_to,
-  eversion_t min_last_complete_ondisk,
+  eversion_t pg_committed_to,
   hobject_t new_temp_oid,
   hobject_t discard_temp_oid,
   const bufferlist &log_entries,
@@ -992,7 +992,7 @@ Message * ReplicatedBackend::generate_subop(
 
   // this feature is from 2019 (6f12bf27cb91), assume present
   ceph_assert(HAVE_FEATURE(parent->min_peer_features(), OSD_REPOP_MLCOD));
-  wr->min_last_complete_ondisk = min_last_complete_ondisk;
+  wr->pg_committed_to = pg_committed_to;
 
   wr->new_temp_oid = new_temp_oid;
   wr->discard_temp_oid = discard_temp_oid;
@@ -1006,7 +1006,7 @@ void ReplicatedBackend::issue_op(
   ceph_tid_t tid,
   osd_reqid_t reqid,
   eversion_t pg_trim_to,
-  eversion_t min_last_complete_ondisk,
+  eversion_t pg_committed_to,
   hobject_t new_temp_oid,
   hobject_t discard_temp_oid,
   const vector<pg_log_entry_t> &log_entries,
@@ -1039,7 +1039,7 @@ void ReplicatedBackend::issue_op(
          tid,
          reqid,
          pg_trim_to,
-         min_last_complete_ondisk,
+         pg_committed_to,
          new_temp_oid,
          discard_temp_oid,
          logs,
@@ -1141,7 +1141,7 @@ void ReplicatedBackend::do_repop(OpRequestRef op)
     m->updated_hit_set_history,
     m->pg_trim_to,
     m->version, /* Replicated PGs don't have rollback info */
-    m->min_last_complete_ondisk,
+    m->pg_committed_to,
     update_snaps,
     rm->localt,
     async);
index aab75d21c73727bef94fadb4a94ba16b5bfcde13..2f3c1ea2509e9d8fa5dc5cb7439bf8c2d37d496a 100644 (file)
@@ -356,7 +356,7 @@ public:
     const eversion_t &at_version,
     PGTransactionUPtr &&t,
     const eversion_t &trim_to,
-    const eversion_t &min_last_complete_ondisk,
+    const eversion_t &pg_committed_to,
     std::vector<pg_log_entry_t>&& log_entries,
     std::optional<pg_hit_set_history_t> &hset_history,
     Context *on_all_commit,
@@ -372,7 +372,7 @@ private:
     ceph_tid_t tid,
     osd_reqid_t reqid,
     eversion_t pg_trim_to,
-    eversion_t min_last_complete_ondisk,
+    eversion_t pg_committed_to,
     hobject_t new_temp_oid,
     hobject_t discard_temp_oid,
     const ceph::buffer::list &log_entries,
@@ -386,7 +386,7 @@ private:
     ceph_tid_t tid,
     osd_reqid_t reqid,
     eversion_t pg_trim_to,
-    eversion_t min_last_complete_ondisk,
+    eversion_t pg_committed_to,
     hobject_t new_temp_oid,
     hobject_t discard_temp_oid,
     const std::vector<pg_log_entry_t> &log_entries,