]> git.apps.os.sepia.ceph.com Git - ceph.git/commitdiff
osd: EC Optimizations: proc_master_log changes for partial logs 62523/head
author Bill Scales <bill_scales@uk.ibm.com>
Wed, 26 Mar 2025 13:43:43 +0000 (13:43 +0000)
committer Alex Ainscow <aainscow@uk.ibm.com>
Tue, 22 Apr 2025 07:17:03 +0000 (08:17 +0100)
proc_master_log is part of the peering process that merges
the authoritative log (in the case of EC pools, the log of the
shard missing the most updates) into the primary log.

When there are partial writes it is likely that the
authoritative log is behind because of partial writes that
did not update that shard. proc_master_log works out where
the logs diverge and then studies each additional log entry
to see if all the updates made in that log entry have been
applied. If any shard is missing an update then that log
entry (and all subsequent entries) must be rolled back;
otherwise the entry can be rolled forward and included in
the authoritative log.

Signed-off-by: Bill Scales <bill_scales@uk.ibm.com>
src/osd/PeeringState.cc
src/osd/PeeringState.h

index 47bb4ec79c46553a12f767d6aa4d9537285b85ee..30fb2bf38cbbea4849412afee2c786092beb08c1 100644 (file)
@@ -3214,6 +3214,32 @@ void PeeringState::proc_primary_info(
   }
 }
 
+void PeeringState::consider_rollback_pwlc(eversion_t last_complete)
+{
+  for (const auto & [shard, versionrange] :
+        info.partial_writes_last_complete) {
+    auto [fromversion, toversion] = versionrange;
+    if (last_complete < fromversion) {
+      // The whole pwlc range for this shard lies beyond last_complete, so
+      // collapse it to (last_complete, last_complete). This can happen if
+      // peering is attempted with an OSD missing but fails to activate
+      // (typically because of a wait upthru) before the missing OSD returns.
+      info.partial_writes_last_complete[shard] = std::pair(last_complete,
+                                                          last_complete);
+      // Stamp the end of the range with the current epoch so that this is
+      // recognised as the newest pwlc update
+      info.partial_writes_last_complete[shard].second.epoch =
+       get_osdmap_epoch();
+      psdout(10) << "shard " << shard << " pwlc rolled back to "
+                << info.partial_writes_last_complete[shard] << dendl;
+    } else if (last_complete < toversion) {  // only the end is ahead: clip it
+      info.partial_writes_last_complete[shard].second = last_complete;
+      psdout(10) << "shard " << shard << " pwlc rolled back to "
+                << info.partial_writes_last_complete[shard] << dendl;
+    }
+  }
+}
+
 void PeeringState::proc_master_log(
   ObjectStore::Transaction& t, pg_info_t &oinfo,
   pg_log_t&& olog, pg_missing_t&& omissing, pg_shard_t from)
@@ -3222,6 +3248,92 @@ void PeeringState::proc_master_log(
             << olog << " " << omissing << dendl;
   ceph_assert(!is_peered() && is_primary());
 
+  if (info.partial_writes_last_complete.contains(from.shard)) {
+    // Check if last_complete and last_update can be advanced based on
+    // knowledge of partial_writes
+    const auto & [fromversion, toversion] =
+      info.partial_writes_last_complete[from.shard];
+    if (toversion > oinfo.last_complete) {
+      if (fromversion <= oinfo.last_complete) {
+       psdout(10) << "osd." << from << " has last_complete "
+                  << oinfo.last_complete
+                  << " but pwlc says its at " << toversion << dendl;
+       oinfo.last_complete = toversion;
+       if (toversion > oinfo.last_update) {
+         oinfo.last_update = toversion;
+       }
+       if (toversion > olog.head) {
+         olog.head = toversion;
+       }
+      } else {
+       psdout(10) << "osd." << from << " has last_complete "
+                  << oinfo.last_complete << " cannot apply pwlc from "
+                  << fromversion << " to " << toversion << dendl;
+      }
+    }
+  }
+  // For partial writes we may be able to keep some of the divergent entries
+  if (olog.head < pg_log.get_head()) {
+    // Iterate backwards to divergence
+    auto p = pg_log.get_log().log.end();
+    while (true) {
+      if (p == pg_log.get_log().log.begin()) {
+       break;
+      }
+      --p;
+      if (p->version.version <= olog.head.version) {
+       break;
+      }
+    }
+    // See if we can wind forward partially written entries
+    map<pg_shard_t, pg_info_t> all_info(peer_info.begin(), peer_info.end());
+    all_info[pg_whoami] = info;
+    while (p->version == olog.head) {
+      ++p;
+      if (p == pg_log.get_log().log.end()) {
+       break;
+      }
+      if (p->is_written_shard(from.shard)) {
+        psdout(10) << "entry " << p->version << " has written shards "
+                  << p->written_shards << " so is divergent" << dendl;
+       // This entry was meant to be written on from, this is the first
+       // divergent entry
+       break;
+      } else {
+       // Test if enough shards have the update
+       shard_id_set shards_with_update;
+       shard_id_set shards_without_update;
+        for (auto&& [pg_shard, pi] : all_info) {
+         psdout(20) << "version " << p->version
+                    << " testing osd " << pg_shard
+                    << " written=" << p->written_shards
+                    << " present=" << p->present_shards << dendl;
+         if (p->is_present_shard(pg_shard.shard) &&
+             p->is_written_shard(pg_shard.shard)) {
+           if (pi.last_update < p->version) {
+             if (!shards_with_update.contains(pg_shard.shard)) {
+               shards_without_update.insert(pg_shard.shard);
+             }
+           } else {
+             shards_with_update.insert(pg_shard.shard);
+             shards_without_update.erase(pg_shard.shard);
+           }
+         }
+       }
+       psdout(20) << "shards_with_update=" << shards_with_update
+                  << " shards_without_update=" << shards_without_update
+                  << dendl;
+       if (!shards_without_update.empty()) {
+         // A shard is missing this write - this is the first divergent entry
+         break;
+       }
+       // This entry can be kept, only shards that didn't participate in
+       // the partial write missed the update
+        psdout(20) << "keeping entry " << p->version << dendl;
+       olog.head = p->version;
+      }
+    }
+  }
   // merge log into our own log to build master log.  no need to
   // make any adjustments to their missing map; we are taking their
   // log to be authoritative (i.e., their entries are by definitely
@@ -3233,6 +3345,10 @@ void PeeringState::proc_master_log(
             << " " << omissing << dendl;
   might_have_unfound.insert(from);
 
+  // our log is now authoritative - update pwlc information based
+  // on the log head
+  consider_rollback_pwlc(pg_log.get_head());
+
   // See doc/dev/osd_internals/last_epoch_started
   if (oinfo.last_epoch_started > info.last_epoch_started) {
     info.last_epoch_started = oinfo.last_epoch_started;
index c47b7616a84aacac3d736c1a3290c60ff7b6322f..82c0082bb6e668921dce6127ba6bd19e827e6df0 100644 (file)
@@ -1764,6 +1764,7 @@ private:
     pg_log_t&& olog, pg_shard_t from);
 
   void proc_primary_info(ObjectStore::Transaction &t, const pg_info_t &info);
+  void consider_rollback_pwlc(eversion_t last_complete);
   void proc_master_log(ObjectStore::Transaction& t, pg_info_t &oinfo,
                       pg_log_t&& olog, pg_missing_t&& omissing,
                       pg_shard_t from);