git.apps.os.sepia.ceph.com Git - ceph.git/commitdiff
osd: EC Optimizations: Relax reset_complete_to for partial writes 62522/head
author Bill Scales <bill_scales@uk.ibm.com>
Wed, 26 Mar 2025 10:46:07 +0000 (10:46 +0000)
committer Alex Ainscow <aainscow@uk.ibm.com>
Tue, 22 Apr 2025 07:04:24 +0000 (08:04 +0100)
EC Optimized pools can have shards missing log entries because
of partial writes. This means it is possible to have a missing
entry with a newer version than the log. Relax an assert in
reset_complete_to so that it does not fire in this case.

reset_complete_to also resets last_complete to 0 when the
oldest missing object is before the first log entry. This
is too aggressive for partial writes and needs to be relaxed.

Signed-off-by: Bill Scales <bill_scales@uk.ibm.com>
src/osd/PGLog.h
src/osd/PeeringState.cc

index 4d899f2356b38ab0da3d115faccfbcf7e0192776..a6434098d2d34299045bc590aff44e062e7b1df5 100644 (file)
@@ -929,7 +929,7 @@ public:
     ceph_assert(log.get_can_rollback_to() >= v);
   }
 
-  void reset_complete_to(pg_info_t *info) {
+  void reset_complete_to(pg_info_t *info, bool ec_optimizations_enabled) {
     if (log.log.empty()) // caller is split_into()
       return;
     log.complete_to = log.log.begin();
@@ -938,13 +938,26 @@ public:
     if (oldest_need != eversion_t()) {
       while (log.complete_to->version < oldest_need) {
         ++log.complete_to;
+       // partial writes allow a shard which did not participate in a write to
+       // have a missing version that is newer that the most recent log entry
+       if (ec_optimizations_enabled && (log.complete_to == log.log.end())) {
+         break;
+       }
         ceph_assert(log.complete_to != log.log.end());
       }
     }
     if (!info)
       return;
     if (log.complete_to == log.log.begin()) {
-      info->last_complete = eversion_t();
+      // partial writes use last complete to track shards that did not
+      // participate in a write - do not reset it unnecessarily
+      if (!ec_optimizations_enabled) {
+       info->last_complete = eversion_t();
+      } else if ((oldest_need != eversion_t()) &&
+                info->last_complete >= oldest_need) {
+       info->last_complete = eversion_t(oldest_need.epoch,
+                                        oldest_need.version - 1);
+      }
     } else {
       --log.complete_to;
       info->last_complete = log.complete_to->version;
@@ -952,8 +965,8 @@ public:
     }
   }
 
-  void activate_not_complete(pg_info_t &info) {
-    reset_complete_to(&info);
+  void activate_not_complete(pg_info_t &info, bool ec_optimizations_enabled) {
+    reset_complete_to(&info, ec_optimizations_enabled);
     log.last_requested = 0;
   }
 
@@ -1322,7 +1335,8 @@ public:
   bool append_new_log_entries(
     const hobject_t &last_backfill,
     const mempool::osd_pglog::list<pg_log_entry_t> &entries,
-    LogEntryHandler *rollbacker) {
+    LogEntryHandler *rollbacker,
+    bool ec_optimizations_enabled) {
     bool invalidate_stats = append_log_entries_update_missing(
       last_backfill,
       entries,
@@ -1342,7 +1356,7 @@ public:
        // always in a std::list of solely lost_delete entries, so it is
        // sufficient to check whether the first entry is a
        // lost_delete
-       reset_complete_to(nullptr);
+       reset_complete_to(nullptr, ec_optimizations_enabled);
       }
     }
     return invalidate_stats;
index 3d0a74dcf6f8f18a61274c7717f647f83582afa3..89f6037406ef7467603963f3d7c638cd6bfe8d7f 100644 (file)
@@ -2837,7 +2837,7 @@ void PeeringState::activate(
   } else {
     psdout(10) << "activate - not complete, " << missing << dendl;
     info.stats.stats.sum.num_objects_missing = missing.num_missing();
-    pg_log.activate_not_complete(info);
+    pg_log.activate_not_complete(info, pool.info.allows_ecoptimizations());
   }
 
   log_weirdness();
@@ -3389,6 +3389,8 @@ void PeeringState::try_mark_clean()
 void PeeringState::split_into(
   pg_t child_pgid, PeeringState *child, unsigned split_bits)
 {
+  bool ec_optimizations_enabled = pool.info.allows_ecoptimizations();
+
   child->update_osdmap_ref(get_osdmap());
   child->pool = pool;
 
@@ -3407,8 +3409,8 @@ void PeeringState::split_into(
   child->info.log_tail = child->pg_log.get_tail();
 
   // reset last_complete, we might have modified pg_log & missing above
-  pg_log.reset_complete_to(&info);
-  child->pg_log.reset_complete_to(&child->info);
+  pg_log.reset_complete_to(&info, ec_optimizations_enabled);
+  child->pg_log.reset_complete_to(&child->info, ec_optimizations_enabled);
 
   // Info
   child->info.history = info.history;
@@ -4227,7 +4229,8 @@ bool PeeringState::append_log_entries_update_missing(
     pg_log.append_new_log_entries(
       info.last_backfill,
       entries,
-      rollbacker.get());
+      rollbacker.get(),
+      pool.info.allows_ecoptimizations());
 
   if (pg_committed_to && entries.rbegin()->soid > info.last_backfill) {
     pg_log.roll_forward(&info, rollbacker.get());
@@ -4502,7 +4505,7 @@ void PeeringState::force_object_missing(
       peer_missing[peer].add(soid, version, eversion_t(), false);
     } else {
       pg_log.missing_add(soid, version, eversion_t());
-      pg_log.reset_complete_to(&info);
+      pg_log.reset_complete_to(&info, pool.info.allows_ecoptimizations());
       pg_log.set_last_requested(0);
     }
   }