]> git-server-git.apps.pok.os.sepia.ceph.com Git - ceph.git/commitdiff
osd: EC Optimizations: Update pwlc in pg_info_t
authorBill Scales <bill_scales@uk.ibm.com>
Tue, 25 Mar 2025 17:41:57 +0000 (17:41 +0000)
committerAlex Ainscow <aainscow@uk.ibm.com>
Tue, 22 Apr 2025 07:04:24 +0000 (08:04 +0100)
Optimized EC pools add extra data to the log entry to track
which shards were updated by a partial write. When the log
entry is completed this needs to be summarized in the
partial_writes_last_complete map in pg_info_t.

Summarising this data in pg_info_t makes it easy to determine
whether the reason a shard is behind is because it is missing
update or has just not been involved in recent updates. This
also ensures that even if there is a long sequence of
updates that all skip updating a shard that a record of this
is retained in the info structure even after the log
has been trimmed.

Edited by aainscow as suggested in comment here:
https://github.com/ceph/ceph/pull/62522/files#r2050803678

Signed-off-by: Bill Scales <bill_scales@uk.ibm.com>
Signed-off-by: Alex Ainscow <aainscow@uk.ibm.com>
src/crimson/osd/pg.h
src/osd/PG.h
src/osd/PGBackend.cc
src/osd/PGBackend.h
src/osd/PGLog.cc
src/osd/PGLog.h
src/osd/PeeringState.cc
src/test/osd/TestPGLog.cc

index 0a20f2c4d6241cd31d266d384749bfa1c0fbe863..1ea50542dfcef0feed8afb251bb09ac0f98d9fa5 100644 (file)
@@ -477,6 +477,10 @@ public:
     void trim(const pg_log_entry_t &entry) override {
       // TODO
     }
+    void partial_write(pg_info_t *info, const pg_log_entry_t &entry) override {
+      // TODO
+      ceph_assert(entry.written_shards.empty() && info->partial_writes_last_complete.empty());
+    }
   };
   PGLog::LogEntryHandlerRef get_log_handler(
     ceph::os::Transaction &t) final {
index 28b0e5937efec45d2bdfc48cd000606a4425da45..907c50a6692821886e1502118baca984fa263f04 100644 (file)
@@ -1127,6 +1127,9 @@ protected:
     void trim(const pg_log_entry_t &entry) override {
       pg->get_pgbackend()->trim(entry, t);
     }
+    void partial_write(pg_info_t *info, const pg_log_entry_t &entry) override {
+      pg->get_pgbackend()->partial_write(info, entry);
+    }
   };
 
   void update_object_snap_mapping(
index e91614192f5c23d75f311d0d9b51181e5d63f79f..19ac1ab70ea66e16cdede28ca0ed0f6e9feaf6d7 100644 (file)
@@ -411,6 +411,55 @@ void PGBackend::try_stash(
     ghobject_t(hoid, v, get_parent()->whoami_shard().shard));
 }
 
+void PGBackend::partial_write(
+   pg_info_t *info,
+   const pg_log_entry_t &entry)
+{
+  ceph_assert(info != nullptr);
+  auto dpp = get_parent()->get_dpp();
+  if (!entry.written_shards.empty()) {
+    ldpp_dout(dpp, 20) << __func__ << " version=" << entry.version
+                      << " written_shards=" << entry.written_shards
+                      << " present_shards=" << entry.present_shards
+                      << " pwlc=" << info->partial_writes_last_complete
+                      << dendl;
+    const pg_pool_t &pool = get_parent()->get_pool();
+    for (unsigned int shard = 0;
+        shard < get_parent()->get_pool().size;
+        shard++) {
+      if (pool.is_nonprimary_shard(shard_id_t(shard))) {
+        if (!entry.is_written_shard(shard_id_t(shard))) {
+         if (!info->partial_writes_last_complete.contains(shard_id_t(shard))) {
+           // 1st partial write since all logs were updated
+           info->partial_writes_last_complete[shard_id_t(shard)] =
+             std::pair(entry.prior_version, entry.version);
+         } else if (info->partial_writes_last_complete[shard_id_t(shard)]
+                    .second.version + 1 == entry.version.version) {
+           // Subsequent partial write, version is sequential
+           info->partial_writes_last_complete[shard_id_t(shard)].second =
+             entry.version;
+         } else {
+           // Subsequent partial write, discontiguous versions
+           ldpp_dout(dpp, 20) << __func__ << " cannot update shard " << shard
+                              << dendl;
+         }
+        } else {
+         // Log updated or shard absent, partial write entry not required
+          info->partial_writes_last_complete.erase(shard_id_t(shard));
+       }
+      }
+    }
+    ldpp_dout(dpp, 20) << __func__ << " after pwlc="
+                      << info->partial_writes_last_complete << dendl;
+  } else {
+    // All shard updated - clear partial write data
+    if (!info->partial_writes_last_complete.empty()) {
+      ldpp_dout(dpp, 20) << __func__ << " clear pwlc" << dendl;
+    }
+    info->partial_writes_last_complete.clear();
+  }
+}
+
 void PGBackend::remove(
   const hobject_t &hoid,
   ObjectStore::Transaction *t) {
index 32ca5f17820c9086832ba23a392a8b6ee8667b49..4e5be28b637a25e44f25ba90e85f529f80325fa9 100644 (file)
@@ -491,6 +491,10 @@ typedef std::shared_ptr<const OSDMap> OSDMapRef;
      const pg_log_entry_t &entry,
      ObjectStore::Transaction *t);
 
+   void partial_write(
+     pg_info_t *info,
+     const pg_log_entry_t &entry);
+
    void remove(
      const hobject_t &hoid,
      ObjectStore::Transaction *t);
index 9e7c0405a63e05a67e2010c33ef14698d3da12df..5831cef8c2b612d47b7120ecc20060a1538c791f 100644 (file)
@@ -465,7 +465,7 @@ void PGLog::merge_log(pg_info_t &oinfo, pg_log_t&& olog, pg_shard_t fromosd,
     for (auto &&oe: divergent) {
       dout(10) << "merge_log divergent " << oe << dendl;
     }
-    log.roll_forward_to(log.head, rollbacker);
+    log.roll_forward_to(log.head, &info, rollbacker);
 
     mempool::osd_pglog::list<pg_log_entry_t> new_entries;
     new_entries.splice(new_entries.end(), olog.log, from, to);
index 10c7fc0e7ea7bf149d85546770bf5dabd6780e9f..4d899f2356b38ab0da3d115faccfbcf7e0192776 100644 (file)
@@ -146,6 +146,9 @@ struct PGLog : DoutPrefixProvider {
     virtual void try_stash(
       const hobject_t &hoid,
       version_t v) = 0;
+    virtual void partial_write(
+      pg_info_t *info,
+      const pg_log_entry_t &entry) = 0;
     virtual ~LogEntryHandler() {}
   };
   using LogEntryHandlerRef = std::unique_ptr<LogEntryHandler>;
@@ -249,18 +252,28 @@ public:
       return *this;
     }
 
-    void trim_rollback_info_to(eversion_t to, LogEntryHandler *h) {
+    void trim_rollback_info_to(eversion_t to, pg_info_t *info, LogEntryHandler *h) {
       advance_can_rollback_to(
        to,
        [&](pg_log_entry_t &entry) {
          h->trim(entry);
+         h->partial_write(info, entry);
        });
     }
-    bool roll_forward_to(eversion_t to, LogEntryHandler *h) {
+    bool roll_forward_to(eversion_t to, pg_info_t *info, LogEntryHandler *h) {
       return advance_can_rollback_to(
        to,
        [&](pg_log_entry_t &entry) {
          h->rollforward(entry);
+         h->partial_write(info, entry);
+       });
+    }
+
+    void skip_can_rollback_to_to_head(pg_info_t *info, LogEntryHandler *h) {
+      advance_can_rollback_to(
+       head,
+        [&](pg_log_entry_t &entry) {
+         h->partial_write(info, entry);
        });
     }
 
@@ -826,9 +839,11 @@ public:
 
   void roll_forward_to(
     eversion_t roll_forward_to,
+    pg_info_t *info,
     LogEntryHandler *h) {
     if (log.roll_forward_to(
          roll_forward_to,
+         info,
          h))
       dirty_log = true;
   }
@@ -837,20 +852,22 @@ public:
     return log.get_can_rollback_to();
   }
 
-  void roll_forward(LogEntryHandler *h) {
+  void roll_forward(pg_info_t *info, LogEntryHandler *h) {
     roll_forward_to(
       log.head,
+      info,
       h);
   }
 
-  void skip_rollforward() {
-    log.skip_can_rollback_to_to_head();
+  void skip_rollforward(pg_info_t *info, LogEntryHandler *h) {
+    // Update pwlc during backfill
+    log.skip_can_rollback_to_to_head(info, h);
   }
 
   //////////////////// get or std::set log & missing ////////////////////
 
-  void reset_backfill_claim_log(const pg_log_t &o, LogEntryHandler *h) {
-    log.trim_rollback_info_to(log.head, h);
+  void reset_backfill_claim_log(const pg_log_t &o, pg_info_t *info, LogEntryHandler *h) {
+    log.trim_rollback_info_to(log.head, info, h);
     log.claim_log_and_clear_rollback_info(o);
     missing.clear();
     mark_dirty_to(eversion_t::max());
index 4dd7543e10ac91714255dcbf56597bd966526b9c..e3d10e4024e8e5b473a3f498a295f0eba2fabe5e 100644 (file)
@@ -3012,7 +3012,7 @@ void PeeringState::activate(
   }
   if (acting_set_writeable()) {
     PGLog::LogEntryHandlerRef rollbacker{pl->get_log_handler(t)};
-    pg_log.roll_forward(rollbacker.get());
+    pg_log.roll_forward(&info, rollbacker.get());
   }
 }
 
@@ -3398,7 +3398,7 @@ void PeeringState::merge_from(
   }
 
   PGLog::LogEntryHandlerRef handler{pl->get_log_handler(rctx.transaction)};
-  pg_log.roll_forward(handler.get());
+  pg_log.roll_forward(&info, handler.get());
 
   info.last_complete = info.last_update;  // to fake out trim()
   pg_log.reset_recovery_pointers();
@@ -3437,7 +3437,7 @@ void PeeringState::merge_from(
     // prepare log
     PGLog::LogEntryHandlerRef handler{
       source->pl->get_log_handler(rctx.transaction)};
-    source->pg_log.roll_forward(handler.get());
+    source->pg_log.roll_forward(&info, handler.get());
     source->info.last_complete = source->info.last_update;  // to fake out trim()
     source->pg_log.reset_recovery_pointers();
     source->pg_log.trim(source->info.last_update, source->info);
@@ -4116,10 +4116,10 @@ bool PeeringState::append_log_entries_update_missing(
       rollbacker.get());
 
   if (pg_committed_to && entries.rbegin()->soid > info.last_backfill) {
-    pg_log.roll_forward(rollbacker.get());
+    pg_log.roll_forward(&info, rollbacker.get());
   }
   if (pg_committed_to && *pg_committed_to > pg_log.get_can_rollback_to()) {
-    pg_log.roll_forward_to(*pg_committed_to, rollbacker.get());
+    pg_log.roll_forward_to(*pg_committed_to, &info, rollbacker.get());
     last_rollback_info_trimmed_to_applied = *pg_committed_to;
   }
 
@@ -4247,7 +4247,7 @@ void PeeringState::append_log(
       * from the backend and we do not end up in a situation, where the
       * object is deleted before we can _merge_object_divergent_entries().
       */
-    pg_log.skip_rollforward();
+    pg_log.skip_rollforward(&info, handler.get());
   }
 
   for (auto p = logv.begin(); p != logv.end(); ++p) {
@@ -4258,12 +4258,13 @@ void PeeringState::append_log(
      * above */
     if (transaction_applied &&
        p->soid > info.last_backfill) {
-      pg_log.roll_forward(handler.get());
+      pg_log.roll_forward(&info, handler.get());
     }
   }
   if (transaction_applied && roll_forward_to > pg_log.get_can_rollback_to()) {
     pg_log.roll_forward_to(
       roll_forward_to,
+      &info,
       handler.get());
     last_rollback_info_trimmed_to_applied = roll_forward_to;
   }
@@ -4299,7 +4300,7 @@ void PeeringState::recover_got(
      * to roll it back anyway (and we'll be rolled forward shortly
      * anyway) */
     PGLog::LogEntryHandlerRef handler{pl->get_log_handler(t)};
-    pg_log.roll_forward_to(v, handler.get());
+    pg_log.roll_forward_to(v, &info, handler.get());
   }
 
   psdout(10) << "got missing " << oid << " v " << v << dendl;
@@ -6645,7 +6646,7 @@ boost::statechart::result PeeringState::Stray::react(const MLogRec& logevt)
     ps->dirty_big_info = true;  // maybe.
 
     PGLog::LogEntryHandlerRef rollbacker{pl->get_log_handler(t)};
-    ps->pg_log.reset_backfill_claim_log(msg->log, rollbacker.get());
+    ps->pg_log.reset_backfill_claim_log(msg->log, &ps->info, rollbacker.get());
 
     ps->pg_log.reset_backfill();
   } else {
@@ -6794,7 +6795,7 @@ PeeringState::Deleting::Deleting(my_context ctx)
 
   // clear log
   PGLog::LogEntryHandlerRef rollbacker{pl->get_log_handler(t)};
-  ps->pg_log.roll_forward(rollbacker.get());
+  ps->pg_log.roll_forward(&ps->info, rollbacker.get());
 
   // adjust info to backfill
   ps->info.set_last_backfill(hobject_t());
index 1fff469d1fe5e0cf1b697578efcf5d26309946bb..5b85e0f9d2e880a1fd50c4770f3d39fcdd79d8ac 100644 (file)
@@ -233,6 +233,9 @@ public:
     }
     void trim(
       const pg_log_entry_t &entry) override {}
+    void partial_write(
+      pg_info_t *info,
+      const pg_log_entry_t &entry) override {}
   };
 
   template <typename missing_t>
@@ -356,6 +359,9 @@ struct TestHandler : public PGLog::LogEntryHandler {
   }
   void trim(
     const pg_log_entry_t &entry) override {}
+  void partial_write(
+    pg_info_t *info,
+    const pg_log_entry_t &entry) override {}
 };
 
 TEST_F(PGLogTest, rewind_divergent_log) {
@@ -530,11 +536,11 @@ TEST_F(PGLogTest, rewind_divergent_log) {
       add(e);
     }
     TestHandler h(remove_snap);
-    roll_forward_to(eversion_t(1, 6), &h);
+    roll_forward_to(eversion_t(1, 6), &info, &h);
     rewind_divergent_log(eversion_t(1, 5), info, &h,
                         dirty_info, dirty_big_info);
     pg_log_t log;
-    reset_backfill_claim_log(log, &h);
+    reset_backfill_claim_log(log, &info, &h);
   }
 }