]> git-server-git.apps.pok.os.sepia.ceph.com Git - ceph.git/commitdiff
osd: EC Optimizations fix proc_master_log handling of splits
authorBill Scales <bill_scales@uk.ibm.com>
Wed, 21 May 2025 17:16:50 +0000 (18:16 +0100)
committerAlex Ainscow <aainscow@uk.ibm.com>
Tue, 1 Jul 2025 11:57:50 +0000 (12:57 +0100)
For optimized EC pools proc_master_log needs to deal with
the other log being merged being behind the local log because
it is missing partial writes. This is done by finding the
point where the logs diverge and then checking whether local
log entries have been committed on all the shards.

A bug in this code meant that after a PG split (where there
may be gaps in the log due to entries moving to the other PG)
that the divergence point was not found and committed
partial writes ended up being discarded which creates
unfound objects.

Signed-off-by: Bill Scales <bill_scales@uk.ibm.com>
src/osd/PeeringState.cc

index 048224f6ec7c3d2bcba13f3cd6090794aef254e9..959f5cb023e2133fe9f67de3dacb644ee5c2fec7 100644 (file)
@@ -3327,7 +3327,22 @@ void PeeringState::proc_master_log(
     // See if we can wind forward partially written entries
     map<pg_shard_t, pg_info_t> all_info(peer_info.begin(), peer_info.end());
     all_info[pg_whoami] = info;
-    while (p->version == olog.head) {
+    // Normal case is that both logs have entry olog.head
+    bool can_check_next_entry = (p->version == olog.head);
+    if (p->version < olog.head) {
+      // After a PG split there may be gaps in the log where entries were
+      // split to the other PG. This can result in olog.head being ahead
+      // of p->version. So long as there are no entries in olog between
+      // p->version and olog.head we can still try to wind forward
+      // partially written entries
+      auto op = olog.log.end();
+      if (op == olog.log.begin()) {
+       can_check_next_entry = true;
+      } else if (op->version.version < p->version.version) {
+       can_check_next_entry = true;
+      }
+    }
+    while (can_check_next_entry) {
       ++p;
       if (p == pg_log.get_log().log.end()) {
        break;
@@ -3338,39 +3353,40 @@ void PeeringState::proc_master_log(
        // This entry was meant to be written on from, this is the first
        // divergent entry
        break;
-      } else {
-       // Test if enough shards have the update
-       shard_id_set shards_with_update;
-       shard_id_set shards_without_update;
-        for (auto&& [pg_shard, pi] : all_info) {
-         psdout(20) << "version " << p->version
-                    << " testing osd " << pg_shard
-                    << " written=" << p->written_shards
-                    << " present=" << p->present_shards << dendl;
-         if (p->is_present_shard(pg_shard.shard) &&
-             p->is_written_shard(pg_shard.shard)) {
-           if (pi.last_update < p->version) {
-             if (!shards_with_update.contains(pg_shard.shard)) {
-               shards_without_update.insert(pg_shard.shard);
-             }
-           } else {
-             shards_with_update.insert(pg_shard.shard);
-             shards_without_update.erase(pg_shard.shard);
+      }
+      // Test if enough shards have the update
+      shard_id_set shards_with_update;
+      shard_id_set shards_without_update;
+      for (auto&& [pg_shard, pi] : all_info) {
+       psdout(20) << "version " << p->version
+                  << " testing osd " << pg_shard
+                  << " written=" << p->written_shards
+                  << " present=" << p->present_shards << dendl;
+       if (p->is_present_shard(pg_shard.shard) &&
+           p->is_written_shard(pg_shard.shard)) {
+         if (pi.last_update < p->version) {
+           if (!shards_with_update.contains(pg_shard.shard)) {
+             shards_without_update.insert(pg_shard.shard);
            }
+         } else {
+           shards_with_update.insert(pg_shard.shard);
+           shards_without_update.erase(pg_shard.shard);
          }
        }
-       psdout(20) << "shards_with_update=" << shards_with_update
-                  << " shards_without_update=" << shards_without_update
-                  << dendl;
-       if (!shards_without_update.empty()) {
-         // A shard is missing this write - this is the first divergent entry
-         break;
-       }
-       // This entry can be kept, only shards that didn't participate in
-       // the partial write missed the update
-        psdout(20) << "keeping entry " << p->version << dendl;
-       olog.head = p->version;
       }
+      psdout(20) << "shards_with_update=" << shards_with_update
+                << " shards_without_update=" << shards_without_update
+                << dendl;
+      if (!shards_without_update.empty()) {
+       // A shard is missing this write - this is the first divergent entry
+       break;
+      }
+      // This entry can be kept, only shards that didn't participate in
+      // the partial write missed the update
+      psdout(20) << "keeping entry " << p->version << dendl;
+      olog.head = p->version;
+
+      // We need to continue processing the log, so don't break.
     }
   }
   // merge log into our own log to build master log.  no need to