]> git-server-git.apps.pok.os.sepia.ceph.com Git - ceph-ci.git/commitdiff
osd: Refactor partial_write to address multiple issues.
authorAlex Ainscow <aainscow@uk.ibm.com>
Tue, 29 Apr 2025 11:02:07 +0000 (12:02 +0100)
committerLaura Flores <lflores@ibm.com>
Wed, 9 Jul 2025 15:47:24 +0000 (15:47 +0000)
We fix a number of issues with partial_write here.

Fix an issue where it is unclear whether the empty PWLC state is
newer or older than a populated PWLC on another shard by always
updating the pwlc with an empty range, rather than blank.

This is an unfortunate small increase in metadata, so we should
come back to this in a later commit (or possibly later PR).

Normally a PG log consists of a set of log entries with each
log entry have a version number one greater than the previous
entry. When a PG splits the PG log is split so that each of the
new PGs only has log entries for objects in that PG, which
means there can be gaps between version numbers.

PGBackend::partial_write is trying to keep track of adjacent
log updates than do not update a particular shard storing
these as a range in partial_writes_last_complete. To do this
it must compare with the version number of the previous log
entry rather than testing for a version number increment of one.

Also simplify partial_writes to make it more readable.

Signed-off-by: Bill Scales <bill_scales@uk.ibm.com>
Signed-off-by: Alex Ainscow <aainscow@uk.ibm.com>
(cherry picked from commit 2467406f3e22c0746ce20cd04b838dccedadf055)

src/crimson/osd/pg.h
src/osd/PG.h
src/osd/PGBackend.cc
src/osd/PGBackend.h
src/osd/PGLog.h
src/test/osd/TestPGLog.cc

index 6384f93f72d5b026d756c1b2505df0ef11727d8a..dd35b7f0ecd61cb6f2d8533cb3e827a5678e106a 100644 (file)
@@ -478,9 +478,13 @@ public:
     void trim(const pg_log_entry_t &entry) override {
       // TODO
     }
-    void partial_write(pg_info_t *info, const pg_log_entry_t &entry) override {
+    void partial_write(pg_info_t *info,
+                       eversion_t previous_version,
+                       const pg_log_entry_t &entry
+      ) override {
       // TODO
-      ceph_assert(entry.written_shards.empty() && info->partial_writes_last_complete.empty());
+      ceph_assert(entry.written_shards.empty() &&
+                  info->partial_writes_last_complete.empty());
     }
   };
   PGLog::LogEntryHandlerRef get_log_handler(
index 87c52f5bb723281c30c86c2eb12fbba65a8b6b34..f3e760c1b8fd154c736bccc1838a79f9452b4e91 100644 (file)
@@ -1144,8 +1144,10 @@ protected:
     void trim(const pg_log_entry_t &entry) override {
       pg->get_pgbackend()->trim(entry, t);
     }
-    void partial_write(pg_info_t *info, const pg_log_entry_t &entry) override {
-      pg->get_pgbackend()->partial_write(info, entry);
+    void partial_write(pg_info_t *info, eversion_t previous_version,
+                       const pg_log_entry_t &entry
+      ) override {
+      pg->get_pgbackend()->partial_write(info, previous_version, entry);
     }
   };
 
index ba492a69974ddc3e0fa0b17f6d48f21a332a8402..b199c9248dcbc62a1ed6a0bce6c50c7d0a3c6730 100644 (file)
@@ -413,51 +413,53 @@ void PGBackend::try_stash(
 
 void PGBackend::partial_write(
    pg_info_t *info,
+   eversion_t previous_version,
    const pg_log_entry_t &entry)
 {
   ceph_assert(info != nullptr);
+  if (entry.written_shards.empty() && info->partial_writes_last_complete.empty()) {
+    return;
+  }
   auto dpp = get_parent()->get_dpp();
-  if (!entry.written_shards.empty()) {
-    ldpp_dout(dpp, 20) << __func__ << " version=" << entry.version
-                      << " written_shards=" << entry.written_shards
-                      << " present_shards=" << entry.present_shards
-                      << " pwlc=" << info->partial_writes_last_complete
-                      << dendl;
-    const pg_pool_t &pool = get_parent()->get_pool();
-    for (unsigned int shard = 0;
-        shard < get_parent()->get_pool().size;
-        shard++) {
-      if (pool.is_nonprimary_shard(shard_id_t(shard))) {
-        if (!entry.is_written_shard(shard_id_t(shard))) {
-         if (!info->partial_writes_last_complete.contains(shard_id_t(shard))) {
-           // 1st partial write since all logs were updated
-           info->partial_writes_last_complete[shard_id_t(shard)] =
-             std::pair(entry.prior_version, entry.version);
-         } else if (info->partial_writes_last_complete[shard_id_t(shard)]
-                    .second.version + 1 == entry.version.version) {
-           // Subsequent partial write, version is sequential
-           info->partial_writes_last_complete[shard_id_t(shard)].second =
-             entry.version;
-         } else {
-           // Subsequent partial write, discontiguous versions
-           ldpp_dout(dpp, 20) << __func__ << " cannot update shard " << shard
-                              << dendl;
-         }
-        } else {
-         // Log updated or shard absent, partial write entry not required
-          info->partial_writes_last_complete.erase(shard_id_t(shard));
-       }
+  ldpp_dout(dpp, 20) << __func__ << " version=" << entry.version
+                    << " written_shards=" << entry.written_shards
+                    << " present_shards=" << entry.present_shards
+                    << " pwlc=" << info->partial_writes_last_complete
+                    << " previous_version=" << previous_version
+                    << dendl;
+  const pg_pool_t &pool = get_parent()->get_pool();
+  for (shard_id_t shard : pool.nonprimary_shards) {
+    auto pwlc_iter = info->partial_writes_last_complete.find(shard);
+    if (!entry.is_written_shard(shard)) {
+      if (pwlc_iter == info->partial_writes_last_complete.end()) {
+       // 1st partial write since all logs were updated
+       info->partial_writes_last_complete[shard] =
+         std::pair(previous_version, entry.version);
+
+       continue;
       }
+      auto &&[old_v,  new_v] = pwlc_iter->second;
+      if (old_v == new_v) {
+       old_v = previous_version;
+       new_v = entry.version;
+      } else if (new_v == previous_version) {
+       // Subsequent partial write, contiguous versions
+       new_v = entry.version;
+      } else {
+       // Subsequent partial write, discontiguous versions
+       ldpp_dout(dpp, 20) << __func__ << " cannot update shard " << shard
+                          << dendl;
+      }
+    } else if (pwlc_iter != info->partial_writes_last_complete.end()) {
+      auto &&[old_v,  new_v] = pwlc_iter->second;
+      // Log updated or shard absent, partial write entry is a no-op
+      // FIXME: In a later commit (or later PR) we should look at other ways of
+      //        actually clearing the PWLC once all shards have seen the update.
+      old_v = new_v = entry.version;
     }
-    ldpp_dout(dpp, 20) << __func__ << " after pwlc="
-                      << info->partial_writes_last_complete << dendl;
-  } else {
-    // All shard updated - clear partial write data
-    if (!info->partial_writes_last_complete.empty()) {
-      ldpp_dout(dpp, 20) << __func__ << " clear pwlc" << dendl;
-    }
-    info->partial_writes_last_complete.clear();
   }
+  ldpp_dout(dpp, 20) << __func__ << " after pwlc="
+                    << info->partial_writes_last_complete << dendl;
 }
 
 void PGBackend::remove(
index efe2f173cf5fc116b17f6b7a23d04ff3102507a7..acbff0745b23ef9f27d36a34dd1c12de78e8daf3 100644 (file)
@@ -504,6 +504,7 @@ typedef std::shared_ptr<const OSDMap> OSDMapRef;
 
    void partial_write(
      pg_info_t *info,
+     eversion_t previous_version,
      const pg_log_entry_t &entry);
 
    void remove(
index 329a968509a4a6e60cfd6a2bafc031b1256dbb98..be7e97381f2d19893361017877a58f129f19629a 100644 (file)
@@ -152,8 +152,10 @@ struct PGLog : DoutPrefixProvider {
       const hobject_t &hoid,
       version_t v) = 0;
     virtual void partial_write(
-      pg_info_t *info,
-      const pg_log_entry_t &entry) = 0;
+        pg_info_t *info,
+        eversion_t previous_version,
+        const pg_log_entry_t &entry
+      ) = 0;
     virtual ~LogEntryHandler() {}
   };
   using LogEntryHandlerRef = std::unique_ptr<LogEntryHandler>;
@@ -200,13 +202,20 @@ public:
          rollback_info_trimmed_to = to;
       }
 
+      eversion_t previous_version;
+      if (rollback_info_trimmed_to_riter == log.rend()) {
+       previous_version = tail;
+      } else {
+       previous_version = rollback_info_trimmed_to_riter->version;
+      }
       while (rollback_info_trimmed_to_riter != log.rbegin()) {
        --rollback_info_trimmed_to_riter;
        if (rollback_info_trimmed_to_riter->version > rollback_info_trimmed_to) {
          ++rollback_info_trimmed_to_riter;
          break;
        }
-       f(*rollback_info_trimmed_to_riter);
+       f(*rollback_info_trimmed_to_riter, previous_version);
+       previous_version = rollback_info_trimmed_to_riter->version;
       }
 
       return dirty_log;
@@ -260,30 +269,31 @@ public:
     void trim_rollback_info_to(eversion_t to, pg_info_t *info, LogEntryHandler *h) {
       advance_can_rollback_to(
        to,
-       [&](pg_log_entry_t &entry) {
+       [&](pg_log_entry_t &entry, eversion_t previous_version) {
          h->trim(entry);
-         h->partial_write(info, entry);
+         h->partial_write(info, previous_version, entry);
        });
     }
     bool roll_forward_to(eversion_t to, pg_info_t *info, LogEntryHandler *h) {
       return advance_can_rollback_to(
        to,
-       [&](pg_log_entry_t &entry) {
+       [&](pg_log_entry_t &entry, eversion_t previous_version) {
          h->rollforward(entry);
-         h->partial_write(info, entry);
+         h->partial_write(info, previous_version, entry);
        });
     }
 
     void skip_can_rollback_to_to_head(pg_info_t *info, LogEntryHandler *h) {
       advance_can_rollback_to(
        head,
-        [&](pg_log_entry_t &entry) {
-         h->partial_write(info, entry);
+        [&](pg_log_entry_t &entry, eversion_t previous_version) {
+         h->partial_write(info, previous_version, entry);
        });
     }
 
     void skip_can_rollback_to_to_head() {
-      advance_can_rollback_to(head, [&](const pg_log_entry_t &entry) {});
+      advance_can_rollback_to(head, [&](const pg_log_entry_t &entry,
+                                       eversion_t previous_version) {});
     }
 
     mempool::osd_pglog::list<pg_log_entry_t> rewind_from_head(eversion_t newhead) {
index ef6b8758dd3584f39f99c9a8b495676ad8e9ada1..ff275b92bc635659eb1a96cd0266f4e24d7abaef 100644 (file)
@@ -234,8 +234,10 @@ public:
     void trim(
       const pg_log_entry_t &entry) override {}
     void partial_write(
-      pg_info_t *info,
-      const pg_log_entry_t &entry) override {}
+        pg_info_t *info,
+        eversion_t previous_version,
+        const pg_log_entry_t &entry
+      ) override {}
   };
 
   template <typename missing_t>
@@ -360,8 +362,10 @@ struct TestHandler : public PGLog::LogEntryHandler {
   void trim(
     const pg_log_entry_t &entry) override {}
   void partial_write(
-    pg_info_t *info,
-    const pg_log_entry_t &entry) override {}
+      pg_info_t *info,
+      eversion_t previous_version,
+      const pg_log_entry_t &entry
+    ) override {}
 };
 
 TEST_F(PGLogTest, rewind_divergent_log) {