]> git-server-git.apps.pok.os.sepia.ceph.com Git - ceph-ci.git/commitdiff
osd: Do not remove objects with divergent logs if only partial writes.
authorAlex Ainscow <aainscow@uk.ibm.com>
Fri, 19 Dec 2025 09:04:55 +0000 (09:04 +0000)
committerAlex Ainscow <aainscow@uk.ibm.com>
Mon, 22 Dec 2025 14:54:48 +0000 (14:54 +0000)
Fixes https://tracker.ceph.com/issues/74221

Note: An AI was used to assist generating unit tests for this commit.
      The production code was written by the author.

In the scenario we are fixing here, there is a divergent log, which needs to
be rolled back. The non-primary does not participate in the transaction to
the object, but the log exists describing the transaction.  The primary has
a different transaction and has correctly detected the divergence.

The primary correctly concludes that no recovery is needed for the object, since
only partial writes exist on the non-primary.

The non-primary observes its divergent log and incorrectly concludes that
recovery IS needed for the divergent write and prepares by removing that
object.

The consequence of this depends on the next operation:
1. A read will fail with -EIO
2. A RMW involving a read from the removed object  will detect the failure
   and reconstruct the necessary data.
3. A RMW not involve the write or an append will recreate the object, but with
   zeros, so will cause data corruption. A

It is unusual for such a log entry to exist on the non-primary because
normally those are omitted from the non-primary log. The scenario that causes
this when a partial write triggers a clone due to copy on write.  We now have
a clone operation which affects ALL shards and so the log entry is sent to
all shards.

This is unusual to see in the field. We must have all of the following:

1. A clone operation (these are infrequent)
2. A partial write.
3. A peering cycle must happen before this write is complete.

The combination of 1 and 3 make this a very unusual operation in teuthology
and will be even rarer in the field.

The fix ensures we skip divergent log entries for partial writes that the shard
did not participate in.

The following is a minimal script to recreate:

set -e -x

MDS=0 MON=1 OSD=4 MGR=1 ../src/vstart.sh --debug --new -x --localhost -o timeout=10000 -o session_timeout=10000 -o debug_osd=20

ceph osd pool set noautoscale
ceph balancer off
ceph osd set nodeep-scrub
ceph osd set noscrub
ceph osd set noout

ceph config set global bluestore_debug_inject_read_err true

dd if=/dev/random of=file_8k bs=8k count=1
dd if=/dev/random of=file_4k bs=4k count=1

ceph osd erasure-code-profile set alex k=2 m=2
ceph osd pool create mypool --pg_num=1 --pool_type=erasure alex
ceph osd pool set mypool allow_ec_overwrites true
ceph osd pool set mypool allow_ec_optimizations true
ceph osd pool set mypool min_size 2

rados put -p mypool test1 file_8k

acting_set=$(ceph osd map mypool test1 --format=json | jq -r '.acting[]')
acting_array=($acting_set)

shard_0_osd=${acting_array[0]}
shard_1_osd=${acting_array[1]}

echo "Shard 0 OSD: $shard_0_osd"
echo "Shard 1 OSD: $shard_1_osd"

ceph daemon osd.$shard_0_osd injectecwriteerr mypool "*" 2 1 0 1

rados -p mypool mksnap test1_snap
rados put -p mypool test1 file_4k --offset 0 &

ceph osd set noup
ceph osd down $shard_1_osd

wait

ceph osd unset noup

rados -p mypool mksnap test1_snap2
rados put -p mypool test1 file_4k --offset 0

Signed-off-by: Alex Ainscow <aainscow@uk.ibm.com>
src/osd/PGLog.cc
src/osd/PGLog.h
src/osd/PeeringState.cc
src/test/osd/TestPGLog.cc

index 7524228c0e70182028dedb468e097a0ff263c715..e457883c87aa281f7d3925f8c75f51b6c738988f 100644 (file)
@@ -226,6 +226,7 @@ void PGLog::proc_replica_log(
   const pg_log_t &olog,
   pg_missing_t& omissing,
   pg_shard_t from,
+  const pg_shard_t &to,
   bool ec_optimizations_enabled) const
 {
   dout(10) << "proc_replica_log for osd." << from << ": "
@@ -302,8 +303,9 @@ void PGLog::proc_replica_log(
     oinfo,
     olog.get_can_rollback_to(),
     omissing,
-    0,
+    nullptr,
     ec_optimizations_enabled,
+    to.shard,
     this);
 
   if (lu < oinfo.last_update) {
@@ -337,7 +339,8 @@ void PGLog::proc_replica_log(
 void PGLog::rewind_divergent_log(eversion_t newhead,
                                 pg_info_t &info, LogEntryHandler *rollbacker,
                                 bool &dirty_info, bool &dirty_big_info,
-                                bool ec_optimizations_enabled)
+                                bool ec_optimizations_enabled,
+                                const pg_shard_t &shard)
 {
   dout(10) << "rewind_divergent_log truncate divergent future " <<
     newhead << dendl;
@@ -375,6 +378,7 @@ void PGLog::rewind_divergent_log(eversion_t newhead,
     missing,
     rollbacker,
     ec_optimizations_enabled,
+    shard.shard,
     this);
 
   dirty_info = true;
@@ -444,7 +448,8 @@ void PGLog::merge_log(pg_info_t &oinfo, pg_log_t&& olog, pg_shard_t fromosd,
   // do we have divergent entries to throw out?
   if (olog.head < log.head) {
     rewind_divergent_log(olog.head, info, rollbacker,
-                        dirty_info, dirty_big_info, ec_optimizations_enabled);
+                        dirty_info, dirty_big_info, ec_optimizations_enabled,
+                        toosd);
     changed = true;
   }
 
@@ -504,6 +509,7 @@ void PGLog::merge_log(pg_info_t &oinfo, pg_log_t&& olog, pg_shard_t fromosd,
       missing,
       rollbacker,
       ec_optimizations_enabled,
+      toosd.shard,
       this);
 
     info.last_update = log.head = olog.head;
index 1161d0901a9e3a3baca664062b30bdcf4e51dfcb..561ad1c766c56951e6140fd72a7d3098f1846bda 100644 (file)
@@ -1046,6 +1046,7 @@ public:
   void proc_replica_log(pg_info_t &oinfo,
                        const pg_log_t &olog,
                        pg_missing_t& omissing, pg_shard_t from,
+                       const pg_shard_t &to,
                        bool ec_optimizations_enabled) const;
 
   void set_missing_may_contain_deletes() {
@@ -1097,6 +1098,7 @@ protected:
     missing_type &missing,               ///< [in,out] missing to adjust, use
     LogEntryHandler *rollbacker,         ///< [in] optional rollbacker object
     bool ec_optimizations_enabled,       ///< [in] relax asserts for allow_ec_optimzations pools
+    shard_id_t orig_shard,               ///< [in] Which shard has orig_entries
     const DoutPrefixProvider *dpp        ///< [in] logging provider
     ) {
     ldpp_dout(dpp, 20) << __func__ << ": merging hoid " << hoid
@@ -1146,11 +1148,14 @@ protected:
        }
       }
       if (i->is_error()) {
-       ldpp_dout(dpp, 20) << __func__ << ": ignoring " << *i << dendl;
+ ldpp_dout(dpp, 20) << __func__ << ": ignoring " << *i << dendl;
+      } else if (!i->written_shards.empty() && !i->written_shards.contains(orig_shard)) {
+        ldpp_dout(dpp, 20) << __func__ << ": ignoring partial write " << *i << dendl;
+ last = i->version;
       } else {
      ldpp_dout(dpp, 20) << __func__ << ": keeping " << *i << dendl;
      entries.push_back(*i);
      last = i->version;
+ ldpp_dout(dpp, 20) << __func__ << ": keeping " << *i << dendl;
+ entries.push_back(*i);
+ last = i->version;
       }
     }
     if (entries.empty()) {
@@ -1326,6 +1331,7 @@ protected:
     missing_type &omissing,              ///< [in,out] missing to adjust, use
     LogEntryHandler *rollbacker,         ///< [in] optional rollbacker object
     bool ec_optimizations_enabled,       ///< [in] relax asserts for allow_ec_optimzations pools
+    shard_id_t orig_shard,               ///< [in] Which shard is this (for detecting partial writes)
     const DoutPrefixProvider *dpp        ///< [in] logging provider
     ) {
     std::map<hobject_t, mempool::osd_pglog::list<pg_log_entry_t> > split;
@@ -1340,6 +1346,7 @@ protected:
        omissing,
        rollbacker,
        ec_optimizations_enabled,
+       orig_shard,
        dpp);
     }
   }
@@ -1364,6 +1371,7 @@ protected:
       missing,
       rollbacker,
       false, // not allow_ec_optimizations pool
+      shard_id_t(0), // Test doesn't care about this value.
       this);
   }
 
@@ -1376,7 +1384,8 @@ public:
                             LogEntryHandler *rollbacker,
                             bool &dirty_info,
                             bool &dirty_big_info,
-                           bool ec_optimizations_enabled);
+                           bool ec_optimizations_enabled,
+                           const pg_shard_t &shard);
 
   void merge_log(pg_info_t &oinfo,
                 pg_log_t&& olog,
index b6a7ceeec726204c1bf7339701884e04408d01d7..3a05c77a084546066f026db6a509686067337509 100644 (file)
@@ -3291,7 +3291,7 @@ void PeeringState::rewind_divergent_log(
   PGLog::LogEntryHandlerRef rollbacker{pl->get_log_handler(t)};
   pg_log.rewind_divergent_log(
     newhead, info, rollbacker.get(), dirty_info, dirty_big_info,
-    pool.info.allows_ecoptimizations());
+    pool.info.allows_ecoptimizations(), pg_whoami);
 }
 
 
@@ -3538,7 +3538,7 @@ void PeeringState::proc_replica_log(
     apply_pwlc(info.partial_writes_last_complete[from.shard], from, oinfo,
               &olog);
   }
-  pg_log.proc_replica_log(oinfo, olog, omissing, from, pool.info.allows_ecoptimizations());
+  pg_log.proc_replica_log(oinfo, olog, omissing, from, pg_whoami, pool.info.allows_ecoptimizations());
 
   peer_info[from] = oinfo;
   update_peer_info(from, oinfo);
index 8e7993550e56f79d6401dd8cd0d57fad3d985178..7871fb8502dbf00f2eeb8079cdd5f63d2bfa1d1b 100644 (file)
@@ -121,6 +121,16 @@ struct PGLogTestBase {
     const hobject_t &hoid, eversion_t v, eversion_t pv) {
     return mk_ple_dt_rb(hoid, v, pv, osd_reqid_t());
   }
+  static pg_log_entry_t mk_ple_clone(
+    const hobject_t &hoid, eversion_t v, eversion_t pv) {
+    pg_log_entry_t e;
+    e.mark_unrollbackable();
+    e.op = pg_log_entry_t::CLONE;
+    e.soid = hoid;
+    e.version = v;
+    e.prior_version = pv;
+    return e;
+  }
   static pg_log_entry_t mk_ple_err(
     const hobject_t &hoid, eversion_t v) {
     return mk_ple_err(hoid, v, osd_reqid_t());
@@ -316,7 +326,8 @@ public:
     pg_info_t oinfo = tcase.get_divinfo();
 
     proc_replica_log(
-      oinfo, olog, omissing, pg_shard_t(1, shard_id_t(0)), false);
+      oinfo, olog, omissing, pg_shard_t(1, shard_id_t(0)),
+      pg_shard_t(0, shard_id_t(1)), false);
 
     ceph_assert(oinfo.last_update >= log.tail);
 
@@ -439,7 +450,7 @@ TEST_F(PGLogTest, rewind_divergent_log) {
 
     TestHandler h(remove_snap);
     rewind_divergent_log(newhead, info, &h,
-                        dirty_info, dirty_big_info, false);
+                        dirty_info, dirty_big_info, false, pg_shard_t());
 
     EXPECT_TRUE(log.objects.count(divergent));
     EXPECT_TRUE(missing.is_missing(divergent_object));
@@ -504,7 +515,7 @@ TEST_F(PGLogTest, rewind_divergent_log) {
 
     TestHandler h(remove_snap);
     rewind_divergent_log(newhead, info, &h,
-                        dirty_info, dirty_big_info, false);
+                        dirty_info, dirty_big_info, false, pg_shard_t());
 
     EXPECT_TRUE(missing.is_missing(divergent_object));
     EXPECT_EQ(0U, log.objects.count(divergent_object));
@@ -543,7 +554,7 @@ TEST_F(PGLogTest, rewind_divergent_log) {
     TestHandler h(remove_snap);
     roll_forward_to(eversion_t(1, 6), &info, &h);
     rewind_divergent_log(eversion_t(1, 5), info, &h,
-                        dirty_info, dirty_big_info, false);
+                        dirty_info, dirty_big_info, false, pg_shard_t());
     pg_log_t log;
     reset_backfill_claim_log(log, &info, &h);
   }
@@ -1422,7 +1433,8 @@ TEST_F(PGLogTest, proc_replica_log) {
     EXPECT_EQ(last_complete, oinfo.last_complete);
 
     missing.may_include_deletes = false;
-    proc_replica_log(oinfo, olog, omissing, from, false);
+    proc_replica_log(oinfo, olog, omissing, from,
+      pg_shard_t(0, shard_id_t(1)), false);
 
     EXPECT_FALSE(omissing.have_missing());
     EXPECT_EQ(last_update, oinfo.last_update);
@@ -1496,7 +1508,8 @@ TEST_F(PGLogTest, proc_replica_log) {
     EXPECT_EQ(olog.head, oinfo.last_complete);
 
     missing.may_include_deletes = false;
-    proc_replica_log(oinfo, olog, omissing, from, false);
+    proc_replica_log(oinfo, olog, omissing, from,
+      pg_shard_t(0, shard_id_t(1)), false);
 
     EXPECT_FALSE(omissing.have_missing());
   }
@@ -1598,7 +1611,8 @@ TEST_F(PGLogTest, proc_replica_log) {
     EXPECT_EQ(olog.head, oinfo.last_complete);
 
     missing.may_include_deletes = false;
-    proc_replica_log(oinfo, olog, omissing, from, false);
+    proc_replica_log(oinfo, olog, omissing, from,
+      pg_shard_t(0, shard_id_t(1)), false);
 
     EXPECT_TRUE(omissing.have_missing());
     EXPECT_TRUE(omissing.is_missing(divergent_object));
@@ -1685,7 +1699,8 @@ TEST_F(PGLogTest, proc_replica_log) {
     EXPECT_EQ(olog.head, oinfo.last_complete);
 
     missing.may_include_deletes = false;
-    proc_replica_log(oinfo, olog, omissing, from, false);
+    proc_replica_log(oinfo, olog, omissing, from,
+      pg_shard_t(0, shard_id_t(1)), false);
 
     EXPECT_TRUE(omissing.have_missing());
     EXPECT_TRUE(omissing.is_missing(divergent_object));
@@ -1775,7 +1790,8 @@ TEST_F(PGLogTest, proc_replica_log) {
     EXPECT_EQ(olog.head, oinfo.last_complete);
 
     missing.may_include_deletes = false;
-    proc_replica_log(oinfo, olog, omissing, from, false);
+    proc_replica_log(oinfo, olog, omissing, from,
+      pg_shard_t(0, shard_id_t(1)), false);
 
     EXPECT_TRUE(omissing.have_missing());
     EXPECT_TRUE(omissing.is_missing(divergent_object));
@@ -1869,7 +1885,8 @@ TEST_F(PGLogTest, proc_replica_log) {
     EXPECT_EQ(olog.head, oinfo.last_complete);
 
     missing.may_include_deletes = false;
-    proc_replica_log(oinfo, olog, omissing, from, false);
+    proc_replica_log(oinfo, olog, omissing, from,
+      pg_shard_t(0, shard_id_t(1)), false);
 
     EXPECT_TRUE(omissing.have_missing());
     EXPECT_TRUE(omissing.get_items().begin()->second.need == eversion_t(1, 1));
@@ -2949,7 +2966,7 @@ TEST_F(PGLogTest, _merge_object_divergent_entries) {
                                     orig_entries, oinfo,
                                     log.get_can_rollback_to(),
                                     missing, &rollbacker,
-                                    false, this);
+                                    false, shard_id_t(0), this);
     // No core dump
   }
   {
@@ -2975,11 +2992,541 @@ TEST_F(PGLogTest, _merge_object_divergent_entries) {
                                     orig_entries, oinfo,
                                     log.get_can_rollback_to(),
                                     missing, &rollbacker,
-                                    false, this);
+                                    false, shard_id_t(0), this);
     // No core dump
   }
 }
 
+TEST_F(PGLogTest, merge_object_divergent_entries_partial_writes) {
+  {
+    // Test case 1: Partial write on shard that did NOT participate
+    // - should NOT remove object
+    clear();
+    hobject_t hoid = mk_obj(1);
+    
+    mempool::osd_pglog::list<pg_log_entry_t> orig_entries;
+    pg_log_entry_t entry = mk_ple_mod(hoid, eversion_t(10, 100), eversion_t(10, 99));
+    // Partial write: only shards 0 and 1 were written
+    entry.written_shards.insert(shard_id_t(0));
+    entry.written_shards.insert(shard_id_t(1));
+    orig_entries.push_back(entry);
+    
+    log.add(mk_ple_mod(hoid, eversion_t(11, 110), eversion_t(10, 99)));
+
+    pg_info_t oinfo;
+    LogHandler rollbacker;
+    
+    // Shard 2 did NOT participate in the partial write
+    _merge_object_divergent_entries(log, hoid, orig_entries, oinfo,
+                                    log.get_can_rollback_to(), missing,
+                                    &rollbacker, false, shard_id_t(2), this);
+    
+    // Object should NOT be removed
+    EXPECT_EQ(0U, rollbacker.removed.size());
+  }
+  
+  {
+    // Test case 2: Partial write on shard that DID participate
+    // - SHOULD remove object
+    clear();
+    hobject_t hoid = mk_obj(2);
+    
+    mempool::osd_pglog::list<pg_log_entry_t> orig_entries;
+    pg_log_entry_t entry = mk_ple_mod(hoid, eversion_t(10, 100), eversion_t(10, 99));
+    entry.written_shards.insert(shard_id_t(0));
+    entry.written_shards.insert(shard_id_t(1));
+    orig_entries.push_back(entry);
+    
+    log.add(mk_ple_mod(hoid, eversion_t(11, 110), eversion_t(10, 99)));
+    
+    pg_info_t oinfo;
+    LogHandler rollbacker;
+    
+    // Shard 1 DID participate in the partial write
+    _merge_object_divergent_entries(log, hoid, orig_entries, oinfo,
+                                    log.get_can_rollback_to(), missing,
+                                    &rollbacker, false, shard_id_t(1), this);
+    
+    // Object SHOULD be removed (Case 1: more recent entry in log)
+    EXPECT_EQ(1U, rollbacker.removed.size());
+    EXPECT_TRUE(rollbacker.removed.count(hoid));
+    // Object should be in missing with have=eversion_t()
+    EXPECT_TRUE(missing.is_missing(hoid));
+    EXPECT_EQ(eversion_t(), missing.get_items().at(hoid).have);
+  }
+  
+  {
+    // Test case 3: Empty written_shards (full write)
+    // - SHOULD remove object on any shard
+    clear();
+    hobject_t hoid = mk_obj(3);
+    
+    mempool::osd_pglog::list<pg_log_entry_t> orig_entries;
+    pg_log_entry_t entry = mk_ple_mod(hoid, eversion_t(10, 100), eversion_t(10, 99));
+    // Empty written_shards = full write to all shards
+    orig_entries.push_back(entry);
+    
+    log.add(mk_ple_mod(hoid, eversion_t(11, 110), eversion_t(10, 99)));
+    
+    pg_info_t oinfo;
+    LogHandler rollbacker;
+    
+    // Any shard should remove for full writes
+    _merge_object_divergent_entries(log, hoid, orig_entries, oinfo,
+                                    log.get_can_rollback_to(), missing,
+                                    &rollbacker, false, shard_id_t(5), this);
+    
+    // Object SHOULD be removed (Case 1: more recent entry in log)
+    EXPECT_EQ(1U, rollbacker.removed.size());
+    EXPECT_TRUE(rollbacker.removed.count(hoid));
+    // Object should be in missing with have=eversion_t()
+    EXPECT_TRUE(missing.is_missing(hoid));
+    EXPECT_EQ(eversion_t(), missing.get_items().at(hoid).have);
+  }
+  
+  {
+    // Test case 4: Multiple entries, shard participated in one
+    clear();
+    hobject_t hoid = mk_obj(4);
+    
+    mempool::osd_pglog::list<pg_log_entry_t> orig_entries;
+    
+    // First entry: partial write to shards 0, 1
+    pg_log_entry_t entry1 = mk_ple_mod(hoid, eversion_t(10, 100), eversion_t(10, 99));
+    entry1.written_shards.insert(shard_id_t(0));
+    entry1.written_shards.insert(shard_id_t(1));
+    orig_entries.push_back(entry1);
+    
+    // Second entry: partial write to shards 2, 3
+    pg_log_entry_t entry2 = mk_ple_mod(hoid, eversion_t(10, 101), eversion_t(10, 100));
+    entry2.written_shards.insert(shard_id_t(2));
+    entry2.written_shards.insert(shard_id_t(3));
+    orig_entries.push_back(entry2);
+    
+    log.add(mk_ple_mod(hoid, eversion_t(11, 110), eversion_t(10, 99)));
+    
+    pg_info_t oinfo;
+    LogHandler rollbacker;
+    
+    // Shard 2 participated in second entry
+    _merge_object_divergent_entries(log, hoid, orig_entries, oinfo,
+                                    log.get_can_rollback_to(), missing,
+                                    &rollbacker, false, shard_id_t(2), this);
+    
+    // Object SHOULD be removed (Case 1: more recent entry in log)
+    EXPECT_EQ(1U, rollbacker.removed.size());
+    EXPECT_TRUE(rollbacker.removed.count(hoid));
+    // Object should be in missing with have=eversion_t()
+    EXPECT_TRUE(missing.is_missing(hoid));
+    EXPECT_EQ(eversion_t(), missing.get_items().at(hoid).have);
+  }
+  
+  {
+    // Test case 5: Multiple entries, shard participated in none
+    clear();
+    hobject_t hoid = mk_obj(5);
+    
+    mempool::osd_pglog::list<pg_log_entry_t> orig_entries;
+    
+    pg_log_entry_t entry1 = mk_ple_mod(hoid, eversion_t(10, 100), eversion_t(10, 99));
+    entry1.written_shards.insert(shard_id_t(0));
+    entry1.written_shards.insert(shard_id_t(1));
+    orig_entries.push_back(entry1);
+    
+    pg_log_entry_t entry2 = mk_ple_mod(hoid, eversion_t(10, 101), eversion_t(10, 100));
+    entry2.written_shards.insert(shard_id_t(2));
+    entry2.written_shards.insert(shard_id_t(3));
+    orig_entries.push_back(entry2);
+    
+    log.add(mk_ple_mod(hoid, eversion_t(11, 110), eversion_t(10, 99)));
+    
+    pg_info_t oinfo;
+    LogHandler rollbacker;
+    
+    // Shard 5 did NOT participate in any entry
+    _merge_object_divergent_entries(log, hoid, orig_entries, oinfo,
+                                    log.get_can_rollback_to(), missing,
+                                    &rollbacker, false, shard_id_t(5), this);
+    
+    // Object should NOT be removed
+    EXPECT_EQ(0U, rollbacker.removed.size());
+  }
+  
+  {
+    // Test case 7: Case 1 - More recent entry in log with partial write
+    // Non-participating shard should NOT remove
+    clear();
+    hobject_t hoid = mk_obj(7);
+    
+    mempool::osd_pglog::list<pg_log_entry_t> orig_entries;
+    pg_log_entry_t entry = mk_ple_mod(hoid, eversion_t(10, 100), eversion_t(10, 99));
+    entry.written_shards.insert(shard_id_t(0));
+    entry.written_shards.insert(shard_id_t(1));
+    orig_entries.push_back(entry);
+    
+    // Add more recent entry to log - triggers Case 1
+    log.add(mk_ple_mod(hoid, eversion_t(10, 105), eversion_t(10, 100)));
+    missing.add(hoid, eversion_t(10, 105), eversion_t(), false);
+    
+    pg_info_t oinfo;
+    LogHandler rollbacker;
+    
+    // Shard 2 did NOT participate - Case 1 with partial write check
+    _merge_object_divergent_entries(log, hoid, orig_entries, oinfo,
+                                    log.get_can_rollback_to(), missing,
+                                    &rollbacker, false, shard_id_t(2), this);
+    
+    // Object should NOT be removed (shard didn't participate)
+    EXPECT_EQ(0U, rollbacker.removed.size());
+    // Missing should be updated
+    EXPECT_TRUE(missing.is_missing(hoid));
+    EXPECT_EQ(eversion_t(), missing.get_items().at(hoid).have);
+  }
+  
+  {
+    // Test case 8: Case 3 - Object in missing with have != prior_version
+    // and participating shard - should revise need, not remove
+    clear();
+    hobject_t hoid = mk_obj(8);
+    
+    mempool::osd_pglog::list<pg_log_entry_t> orig_entries;
+    pg_log_entry_t entry = mk_ple_mod(hoid, eversion_t(10, 100), eversion_t(10, 99));
+    entry.written_shards.insert(shard_id_t(0));
+    entry.written_shards.insert(shard_id_t(1));
+    orig_entries.push_back(entry);
+    
+    log.add(mk_ple_mod(hoid, eversion_t(10, 105), eversion_t(10, 100)));
+    missing.add(hoid, eversion_t(10, 105), eversion_t(), false);
+    
+    pg_info_t oinfo;
+    LogHandler rollbacker;
+    
+    // Case 3: object in missing, will revise need (not Case 1)
+    _merge_object_divergent_entries(log, hoid, orig_entries, oinfo,
+                                    log.get_can_rollback_to(), missing,
+                                    &rollbacker, false, shard_id_t(1), this);
+    
+    // Object should NOT be removed (Case 3 doesn't remove)
+    EXPECT_EQ(0U, rollbacker.removed.size());
+    // Should revise need
+    EXPECT_TRUE(missing.is_missing(hoid));
+    EXPECT_EQ(eversion_t(10, 99), missing.get_items().at(hoid).need);
+  }
+  
+  {
+    // Test case 9: Case 2 - prior_version is eversion_t() (creation)
+    // Should always remove regardless of shard
+    clear();
+    hobject_t hoid = mk_obj(9);
+    
+    mempool::osd_pglog::list<pg_log_entry_t> orig_entries;
+    pg_log_entry_t entry = mk_ple_mod(hoid, eversion_t(10, 100), eversion_t());
+    // Creation - no partial write, written_shards should be empty
+    orig_entries.push_back(entry);
+    
+    log.add(mk_ple_mod(hoid, eversion_t(11, 110), eversion_t(10, 99)));
+    
+    pg_info_t oinfo;
+    LogHandler rollbacker;
+    
+    // Case 2: prior_version is eversion_t() - always removes
+    _merge_object_divergent_entries(log, hoid, orig_entries, oinfo,
+                                    log.get_can_rollback_to(), missing,
+                                    &rollbacker, false, shard_id_t(2), this);
+    
+    // Object SHOULD be removed (creation case)
+    EXPECT_EQ(1U, rollbacker.removed.size());
+    EXPECT_TRUE(rollbacker.removed.count(hoid));
+  }
+  
+  {
+    // Test case 10: Case 3 - Object in missing, have == prior_version
+    // Should remove from missing but not call rollbacker->remove()
+    clear();
+    hobject_t hoid = mk_obj(10);
+    
+    mempool::osd_pglog::list<pg_log_entry_t> orig_entries;
+    pg_log_entry_t entry = mk_ple_mod(hoid, eversion_t(10, 100), eversion_t(10, 99));
+    entry.written_shards.insert(shard_id_t(0));
+    entry.written_shards.insert(shard_id_t(1));
+    orig_entries.push_back(entry);
+    
+    log.add(mk_ple_mod(hoid, eversion_t(11, 110), eversion_t(10, 99)));
+    // Add to missing with have == prior_version - triggers Case 3
+    missing.add(hoid, eversion_t(11, 110), eversion_t(10, 99), false);
+    
+    pg_info_t oinfo;
+    LogHandler rollbacker;
+    
+    // Case 3: missing.have == prior_version, shard participated
+    _merge_object_divergent_entries(log, hoid, orig_entries, oinfo,
+                                    log.get_can_rollback_to(), missing,
+                                    &rollbacker, false, shard_id_t(1), this);
+    
+    // Object should NOT be removed (Case 3 doesn't call remove)
+    EXPECT_EQ(0U, rollbacker.removed.size());
+    // Should be removed from missing
+    EXPECT_FALSE(missing.is_missing(hoid));
+  }
+  
+  {
+    // Test case 11: Case 3 - Object in missing, have != prior_version
+    // Should revise need but not remove object
+    clear();
+    hobject_t hoid = mk_obj(11);
+    
+    mempool::osd_pglog::list<pg_log_entry_t> orig_entries;
+    pg_log_entry_t entry = mk_ple_mod(hoid, eversion_t(10, 100), eversion_t(10, 99));
+    entry.written_shards.insert(shard_id_t(0));
+    entry.written_shards.insert(shard_id_t(1));
+    orig_entries.push_back(entry);
+    
+    log.add(mk_ple_mod(hoid, eversion_t(11, 110), eversion_t(10, 99)));
+    // Add to missing with have != prior_version
+    missing.add(hoid, eversion_t(11, 110), eversion_t(10, 95), false);
+    
+    pg_info_t oinfo;
+    LogHandler rollbacker;
+    
+    // Case 3: missing.have != prior_version, shard participated
+    _merge_object_divergent_entries(log, hoid, orig_entries, oinfo,
+                                    log.get_can_rollback_to(), missing,
+                                    &rollbacker, false, shard_id_t(1), this);
+    
+    // Object should NOT be removed (Case 3)
+    EXPECT_EQ(0U, rollbacker.removed.size());
+    // Should still be in missing with revised need
+    EXPECT_TRUE(missing.is_missing(hoid));
+    EXPECT_EQ(eversion_t(10, 99), missing.get_items().at(hoid).need);
+  }
+  
+  {
+    // Test case 12: Case 5 - Cannot rollback, partial write, non-participant
+    // Should NOT add to missing and NOT remove object (entry skipped)
+    clear();
+    hobject_t hoid = mk_obj(12);
+    
+    mempool::osd_pglog::list<pg_log_entry_t> orig_entries;
+    pg_log_entry_t entry = mk_ple_mod(hoid, eversion_t(10, 100), eversion_t(10, 99));
+    entry.written_shards.insert(shard_id_t(0));
+    entry.written_shards.insert(shard_id_t(1));
+    orig_entries.push_back(entry);
+    
+    log.add(mk_ple_mod(hoid, eversion_t(11, 110), eversion_t(10, 99)));
+    // Not in missing - will hit Case 5
+    
+    pg_info_t oinfo;
+    LogHandler rollbacker;
+    
+    // Shard 2 didn't participate - entry will be skipped, function returns early
+    _merge_object_divergent_entries(log, hoid, orig_entries, oinfo,
+                                    log.get_can_rollback_to(), missing,
+                                    &rollbacker, false, shard_id_t(2), this);
+    
+    // Object should NOT be removed (didn't participate)
+    EXPECT_EQ(0U, rollbacker.removed.size());
+    // Should NOT be added to missing (entry was skipped)
+    EXPECT_FALSE(missing.is_missing(hoid));
+  }
+  
+  {
+    // Test case 13: Case 5 - Cannot rollback, full write
+    // Should remove object and add to missing
+    clear();
+    hobject_t hoid = mk_obj(13);
+    
+    mempool::osd_pglog::list<pg_log_entry_t> orig_entries;
+    pg_log_entry_t entry = mk_ple_mod(hoid, eversion_t(10, 100), eversion_t(10, 99));
+    // Empty written_shards = full write
+    orig_entries.push_back(entry);
+    
+    log.add(mk_ple_mod(hoid, eversion_t(11, 110), eversion_t(10, 99)));
+    
+    pg_info_t oinfo;
+    LogHandler rollbacker;
+    
+    // Case 5: cannot rollback, full write
+    _merge_object_divergent_entries(log, hoid, orig_entries, oinfo,
+                                    log.get_can_rollback_to(), missing,
+                                    &rollbacker, false, shard_id_t(5), this);
+    
+    // Object SHOULD be removed (full write)
+    EXPECT_EQ(1U, rollbacker.removed.size());
+    EXPECT_TRUE(rollbacker.removed.count(hoid));
+    // Should be added to missing
+    EXPECT_TRUE(missing.is_missing(hoid));
+  }
+  
+  {
+    // Test case 14: Multiple divergent entries, first has partial write, second is full
+    // Second entry is full write, so object SHOULD be removed
+    clear();
+    hobject_t hoid = mk_obj(14);
+    
+    mempool::osd_pglog::list<pg_log_entry_t> orig_entries;
+    
+    // First entry: partial write, shard 2 didn't participate
+    pg_log_entry_t entry1 = mk_ple_mod(hoid, eversion_t(10, 100), eversion_t(10, 99));
+    entry1.written_shards.insert(shard_id_t(0));
+    entry1.written_shards.insert(shard_id_t(1));
+    orig_entries.push_back(entry1);
+    
+    // Second entry: full write (empty written_shards)
+    pg_log_entry_t entry2 = mk_ple_mod(hoid, eversion_t(10, 101), eversion_t(10, 100));
+    orig_entries.push_back(entry2);
+    
+    log.add(mk_ple_mod(hoid, eversion_t(11, 110), eversion_t(10, 99)));
+    
+    pg_info_t oinfo;
+    LogHandler rollbacker;
+    
+    // Loop checks all entries - second is full write
+    _merge_object_divergent_entries(log, hoid, orig_entries, oinfo,
+                                    log.get_can_rollback_to(), missing,
+                                    &rollbacker, false, shard_id_t(2), this);
+    
+    // Object SHOULD be removed (Case 1: more recent entry in log, second entry is full write)
+    EXPECT_EQ(1U, rollbacker.removed.size());
+    EXPECT_TRUE(rollbacker.removed.count(hoid));
+    // Object should be in missing with have=eversion_t()
+    EXPECT_TRUE(missing.is_missing(hoid));
+    EXPECT_EQ(eversion_t(), missing.get_items().at(hoid).have);
+  }
+  
+  {
+    // Test case 15: EC optimizations enabled with partial write
+    // Should work correctly with ec_optimizations_enabled flag
+    clear();
+    hobject_t hoid = mk_obj(15);
+    
+    mempool::osd_pglog::list<pg_log_entry_t> orig_entries;
+    pg_log_entry_t entry = mk_ple_mod(hoid, eversion_t(10, 100), eversion_t(10, 99));
+    entry.written_shards.insert(shard_id_t(0));
+    entry.written_shards.insert(shard_id_t(1));
+    orig_entries.push_back(entry);
+    
+    log.add(mk_ple_mod(hoid, eversion_t(11, 110), eversion_t(10, 99)));
+    
+    pg_info_t oinfo;
+    LogHandler rollbacker;
+    
+    // Test with ec_optimizations_enabled = true
+    _merge_object_divergent_entries(log, hoid, orig_entries, oinfo,
+                                    log.get_can_rollback_to(), missing,
+                                    &rollbacker, true, shard_id_t(2), this);
+    
+    // Object should NOT be removed (shard didn't participate)
+    EXPECT_EQ(0U, rollbacker.removed.size());
+  }
+}
+
+TEST_F(PGLogTest, merge_divergent_entries_clone_with_partial_write) {
+  // Test the scenario where:
+  // 1. Clone head object to snap :1 (creates :1)
+  // 2. Partial write to head object (which also affects :1)
+  // The partial write log entry ends up on a non-primary shard
+  
+  {
+    // Test case 1: Shard did NOT participate in partial write
+    clear();
+    hobject_t head = mk_obj(100);
+    hobject_t snap1 = head;
+    snap1.snap = 1;
+    
+    mempool::osd_pglog::list<pg_log_entry_t> orig_entries;
+    
+    // Clone head to snap1 - this creates snap1
+    pg_log_entry_t clone_entry = mk_ple_clone(snap1, eversion_t(10, 100), eversion_t());
+    orig_entries.push_back(clone_entry);
+    
+    // Partial write to head (which also affects snap1)
+    pg_log_entry_t partial_write = mk_ple_mod(head, eversion_t(10, 101), eversion_t(10, 100));
+    partial_write.written_shards.insert(shard_id_t(0));
+    partial_write.written_shards.insert(shard_id_t(1));
+    orig_entries.push_back(partial_write);
+    
+    // Add divergent entries in log
+    log.add(mk_ple_mod(head, eversion_t(11, 110), eversion_t(10, 99)));
+    log.add(mk_ple_mod(snap1, eversion_t(11, 111), eversion_t()));
+    
+    pg_info_t oinfo;
+    LogHandler rollbacker;
+    
+    // Process for shard 2 which did NOT participate in the partial write
+    // The clone entry should be kept (creates snap1)
+    // The partial write entry should be skipped (shard 2 didn't participate)
+    _merge_divergent_entries(
+      log,
+      orig_entries,
+      oinfo,
+      log.get_can_rollback_to(),
+      missing,
+      &rollbacker,
+      false,
+      shard_id_t(2),
+      this);
+    
+    // snap1 should be removed (it was created by clone, prior_version is eversion_t())
+    EXPECT_TRUE(rollbacker.removed.count(snap1));
+    // head should NOT be removed (shard 2 didn't participate in partial write)
+    EXPECT_FALSE(rollbacker.removed.count(head));
+    // snap1 should NOT be in missing (Case 2: creation, object is simply deleted)
+    EXPECT_FALSE(missing.is_missing(snap1));
+    // head should NOT be in missing (entry was skipped, no divergent entries processed)
+    EXPECT_FALSE(missing.is_missing(head));
+  }
+  
+  {
+    // Test case 2: Shard DID participate in partial write
+    clear();
+    hobject_t head = mk_obj(101);
+    hobject_t snap1 = head;
+    snap1.snap = 1;
+    
+    mempool::osd_pglog::list<pg_log_entry_t> orig_entries;
+    
+    // Clone head to snap1
+    pg_log_entry_t clone_entry = mk_ple_clone(snap1, eversion_t(10, 100), eversion_t());
+    orig_entries.push_back(clone_entry);
+    
+    // Partial write to head
+    pg_log_entry_t partial_write = mk_ple_mod(head, eversion_t(10, 101), eversion_t(10, 100));
+    partial_write.written_shards.insert(shard_id_t(0));
+    partial_write.written_shards.insert(shard_id_t(1));
+    orig_entries.push_back(partial_write);
+    
+    // Add divergent entries in log
+    log.add(mk_ple_mod(head, eversion_t(11, 110), eversion_t(10, 99)));
+    log.add(mk_ple_mod(snap1, eversion_t(11, 111), eversion_t()));
+    
+    pg_info_t oinfo;
+    LogHandler rollbacker;
+    
+    // Process for shard 1 which DID participate in the partial write
+    _merge_divergent_entries(
+      log,
+      orig_entries,
+      oinfo,
+      log.get_can_rollback_to(),
+      missing,
+      &rollbacker,
+      false,
+      shard_id_t(1),
+      this);
+    
+    // Both snap1 and head should be removed
+    EXPECT_TRUE(rollbacker.removed.count(snap1));
+    EXPECT_TRUE(rollbacker.removed.count(head));
+    // snap1 should NOT be in missing (Case 2: creation, object is simply deleted)
+    EXPECT_FALSE(missing.is_missing(snap1));
+    // head should be in missing (Case 1: more recent entry in log)
+    EXPECT_TRUE(missing.is_missing(head));
+    EXPECT_EQ(eversion_t(), missing.get_items().at(head).have);
+  }
+}
+
+
+
 TEST(eversion_t, get_key_name) {
   eversion_t a(1234, 5678);
   std::string a_key_name = a.get_key_name();