git.apps.os.sepia.ceph.com Git - ceph.git/commitdiff
osd: EC Optimizations: Peer changes for partial logs
author: Bill Scales <bill_scales@uk.ibm.com>
Wed, 26 Mar 2025 13:25:07 +0000 (13:25 +0000)
committer: Alex Ainscow <aainscow@uk.ibm.com>
Tue, 22 Apr 2025 07:17:03 +0000 (08:17 +0100)
Changes to peering for replica/stray shards to handle partial
logs. For EC optimized pools, a shard may not have a complete
log if there have been partial writes that did not update
that shard. If the most recent entries in the log have all
skipped updating a shard, then its log will end earlier than
the logs of other shards. During peering, the primary, which
has a full copy of the log, works out whether other shards
have any missing objects and then communicates this to
the replica/stray shards during activation.

The primary uses the partial write last complete data
(partial_writes_last_complete) in pg_info_t to tell other
shards when they are only missing log entries and therefore
just need to advance last_update and last_complete.

Signed-off-by: Bill Scales <bill_scales@uk.ibm.com>
src/osd/PeeringState.cc

index b8bc669473fedfaf0fc51ab99d74078d8e0c70cd..47bb4ec79c46553a12f767d6aa4d9537285b85ee 100644 (file)
@@ -6691,7 +6691,35 @@ boost::statechart::result PeeringState::ReplicaActive::react(const MLogRec& loge
 {
   DECLARE_LOCALS;
   psdout(10) << "received log from " << logevt.from << dendl;
+  MOSDPGLog *msg = logevt.msg.get();
   ObjectStore::Transaction &t = context<PeeringMachine>().get_cur_transaction();
+  if (msg->info.partial_writes_last_complete.contains(ps->pg_whoami.shard)) {
+    // Check if last_complete and last_update can be advanced based on
+    // knowledge of partial_writes
+    const auto & [fromversion, toversion] =
+      msg->info.partial_writes_last_complete[ps->pg_whoami.shard];
+    if (toversion > ps->info.last_complete) {
+      if (fromversion <= ps->info.last_complete) {
+       psdout(10) << "last_complete " << ps->info.last_complete
+                  << " but pwlc from " << logevt.from
+                  << " is at " << toversion << dendl;
+       ps->info.last_complete = toversion;
+       if (toversion > ps->info.last_update) {
+         ps->info.last_update = toversion;
+       }
+       // Advance head to avoid an assert in merge log
+       if (msg->log.tail > ps->pg_log.get_head()) {
+         psdout(10) << "pwlc advancing log head from "
+                   << ps->pg_log.get_head() << " to " << toversion << dendl;
+         ps->pg_log.set_head(toversion);
+       }
+      } else {
+       psdout(10) << "last_complete " << ps->info.last_complete
+                  << " cannot apply pwlc from "
+                  << fromversion << " to " << toversion << dendl;
+      }
+    }
+  }
   ps->merge_log(t, logevt.msg->info, std::move(logevt.msg->log), logevt.from);
   ps->update_peer_info(logevt.from, logevt.msg->info);
   ceph_assert(ps->pg_log.get_head() == ps->info.last_update);
@@ -6806,6 +6834,34 @@ boost::statechart::result PeeringState::Stray::react(const MLogRec& logevt)
 
     ps->pg_log.reset_backfill();
   } else {
+    if (msg->info.partial_writes_last_complete.contains(ps->pg_whoami.shard)) {
+      // Check if last_complete and last_update can be advanced based on
+      // knowledge of partial_writes
+      const auto & [fromversion, toversion] =
+       msg->info.partial_writes_last_complete[ps->pg_whoami.shard];
+      if (toversion > ps->info.last_complete) {
+       if (fromversion <= ps->info.last_complete) {
+         psdout(10) << "last_complete " << ps->info.last_complete
+                    << " but pwlc from " << logevt.from
+                    << " is at " << toversion << dendl;
+         ps->info.last_complete = toversion;
+         if (toversion > ps->info.last_update) {
+           ps->info.last_update = toversion;
+         }
+         // Need to do this to avoid an assert in merge log
+         if (msg->log.tail > ps->pg_log.get_head()) {
+           psdout(10) << "pwlc advancing log head from "
+                      << ps->pg_log.get_head() << " to " << toversion
+                      << dendl;
+           ps->pg_log.set_head(toversion);
+         }
+       } else {
+         psdout(10) << "last_complete " << ps->info.last_complete
+                    << " cannot apply pwlc from "
+                    << fromversion << " to " << toversion << dendl;
+       }
+      }
+    }
     ps->merge_log(t, msg->info, std::move(msg->log), logevt.from);
     ps->update_peer_info(logevt.from, msg->info);
   }
@@ -6836,7 +6892,34 @@ boost::statechart::result PeeringState::Stray::react(const MInfoRec& infoevt)
     ps->proc_lease(*infoevt.lease);
   }
 
-  ceph_assert(infoevt.info.last_update == ps->info.last_update);
+  if (infoevt.info.last_update > ps->info.last_update) {
+    // Log is missing entries, this is only allowed if the
+    // missing entries are all partial writes that did not
+    // update this shard
+
+    // Must be a non-primary shard (which implies it is an EC pool
+    // with ec_optimizations_main set)
+    ceph_assert(ps->pool.info.is_nonprimary_shard(ps->pg_whoami.shard));
+    // There must be a partial write last_complete entry for this shard
+    ceph_assert(infoevt.info.partial_writes_last_complete.contains(
+                                                  ps->pg_whoami.shard));
+    auto pwlc = infoevt.info.partial_writes_last_complete.at(
+                                                  ps->pg_whoami.shard);
+    psdout(20) << "info from osd." << infoevt.from
+              << " last_update=" << infoevt.info.last_update
+              << " last_complete=" << infoevt.info.last_complete
+              << " pwlc=" << pwlc
+              << " our last_update=" << ps->info.last_update << dendl;
+    // Our last update must be in the range described by partial write
+    // last_complete
+    ceph_assert(ps->info.last_update >= pwlc.first);
+    // The pwlc upper bound must match the last_update in the received info
+    ceph_assert(pwlc.second == infoevt.info.last_update);
+  } else {
+    // Log must match after any divergent entries were rewound
+    ceph_assert(infoevt.info.last_update == ps->info.last_update);
+  }
+  // Log must be consistent with info
   ceph_assert(ps->pg_log.get_head() == ps->info.last_update);
   // Update pwlc
   ps->update_peer_info(infoevt.from, infoevt.info);