]> git-server-git.apps.pok.os.sepia.ceph.com Git - ceph.git/commitdiff
crimson/osd/pg_recovery: send MOSDPGRecoveryDelete instead of MOSDPGBackfillRemove 67925/head
authorShraddha Agrawal <shraddha.agrawal000@gmail.com>
Thu, 19 Mar 2026 08:01:28 +0000 (13:31 +0530)
committerShraddha Agrawal <shraddha.agrawal000@gmail.com>
Tue, 31 Mar 2026 09:53:54 +0000 (15:23 +0530)
This commit fixes the abort in Recovered::Recovered.

There is a race to acquire the OBC lock between backfill and
client delete for the same object.

When the lock is acquired first by the backfill, the object is
recovered first, and then deleted by the client delete request.
When recovering the object, the corresponding peer_missing entry
is cleared and we are able to transition to Recovered state
successfully.

When the lock is acquired first by the client delete request, the
object is deleted. Then backfill tries to recover the object,
finds it deleted and exits early. The stale peer_missing
entry is not cleared. In Recovered::Recovered, needs_recovery()
sees this stale peer_missing entry and calls abort.

The issue is fixed by sending MOSDPGRecoveryDelete from the client
path to peers and waiting for MOSDPGRecoveryDeleteReply in
recover_object.

Fixes: https://tracker.ceph.com/issues/70501
Signed-off-by: Shraddha Agrawal <shraddha.agrawal000@gmail.com>
src/crimson/osd/backfill_state.cc
src/crimson/osd/backfill_state.h
src/crimson/osd/pg_recovery.cc
src/crimson/osd/pg_recovery.h
src/crimson/osd/recovery_backend.h
src/crimson/osd/replicated_recovery_backend.cc
src/test/crimson/test_backfill.cc

index 2e39d2f6629acdde8dfaf37cb0bf3011208a704a..411b009d489195af8c5a171b8179e91a42d34370 100644 (file)
@@ -747,10 +747,7 @@ void BackfillState::enqueue_standalone_delete(
   const eversion_t &v,
   const std::vector<pg_shard_t> &peers)
 {
-  progress_tracker->enqueue_drop(obj);
-  for (auto bt : peers) {
-    backfill_machine.backfill_listener.enqueue_drop(bt, obj, v);
-  }
+  backfill_machine.backfill_listener.send_recovery_deletes(obj, peers);
 }
 
 std::ostream &operator<<(std::ostream &out, const BackfillState::PGFacade &pg) {
index 5ca001b7dae06f90ed44833168e1568e8017f0f1..bd30ad9778e25e5521f2c56ffca46fc6b16e7134 100644 (file)
@@ -374,6 +374,10 @@ struct BackfillState::BackfillListener {
     const hobject_t& obj,
     const eversion_t& v) = 0;
 
+  virtual void send_recovery_deletes(
+    const hobject_t& obj,
+    const std::vector<pg_shard_t>& peers) = 0;
+
   virtual void maybe_flush() = 0;
 
   virtual void update_peers_last_backfill(
index 7e1ed5858cf2d6e7662d6e26d75a37bca6110185..e8828d7e84c65726620c5051dbb81ff726104da8 100644 (file)
@@ -15,6 +15,7 @@
 #include "crimson/osd/pg_backend.h"
 #include "crimson/osd/pg_recovery.h"
 
+#include "messages/MOSDPGRecoveryDelete.h"
 #include "osd/osd_types.h"
 #include "osd/PeeringState.h"
 
@@ -574,6 +575,33 @@ void PGRecovery::enqueue_drop(
   req->ls.emplace_back(obj, v);
 }
 
+void PGRecovery::send_recovery_deletes(
+  const hobject_t& obj,
+  const std::vector<pg_shard_t>& peers)
+{
+  LOG_PREFIX(PGRecovery::send_recovery_deletes);
+  DEBUGDPP("obj={}", *pg->get_dpp(), obj);
+  if (!pg->get_recovery_backend()->is_recovering(obj)) {
+    DEBUGDPP("obj={} is not recovering, exiting early", *pg->get_dpp(), obj); 
+    return;
+  }
+  auto& peering_state = pg->get_peering_state();
+  epoch_t min_epoch = pg->get_last_peering_reset();
+  auto& recovering = pg->get_recovery_backend()->get_recovering(obj);
+  for (const auto& peer : peers) {
+    pg_missing_item item;
+    if (peering_state.get_peer_missing(peer).is_missing(obj, &item)) {
+      std::ignore = recovering.wait_for_pushes(peer);
+      spg_t target_pg(pg->get_pgid().pgid, peer.shard);
+      auto msg = crimson::make_message<MOSDPGRecoveryDelete>(
+        pg->get_pg_whoami(), target_pg, pg->get_osdmap_epoch(), min_epoch);
+      msg->objects.push_back(std::make_pair(obj, item.need));
+      std::ignore = pg->get_shard_services().send_to_osd(
+        peer.osd, std::move(msg), pg->get_osdmap_epoch());
+    }
+  }
+}
+
 void PGRecovery::maybe_flush()
 {
   for (auto& [target, req] : backfill_drop_requests) {
index 32976dfcea6d21c72ec385375ccf186bfc8f811a..f3736d4cbee3a48db1937564be65d6814d87055b 100644 (file)
@@ -136,6 +136,9 @@ private:
     const pg_shard_t& target,
     const hobject_t& obj,
     const eversion_t& v) final;
+  void send_recovery_deletes(
+    const hobject_t& obj,
+    const std::vector<pg_shard_t>& peers) final;
   void maybe_flush() final;
   void update_peers_last_backfill(
     const hobject_t& new_last_backfill) final;
index e13e349e6c8d02f307eea33da15007c09e8d7320..5e164384e43619a5fbfc6698a36b988aff7ab187 100644 (file)
@@ -189,6 +189,15 @@ public:
     seastar::future<> wait_for_pushes(pg_shard_t shard) {
       return pushes[shard].get_shared_future();
     }
+    bool has_pushes() const {
+      return !pushes.empty();
+    }
+    seastar::future<> wait_for_all_pushes() {
+      return seastar::parallel_for_each(pushes,
+       [](auto& entry) {
+         return entry.second.get_shared_future();
+       });
+    }
     seastar::future<> wait_for_recovered() {
       if (!recovered) {
        recovered = seastar::shared_promise<>();
index 0711ec430ed99d981d969c65143518434ea345e6..a1f2f99caabfb6d5f86584627cde8f538dee1265 100644 (file)
@@ -43,15 +43,30 @@ ReplicatedRecoveryBackend::recover_object(
     DEBUGDPP("loading obc: {}", pg, soid);
     return pg.obc_loader.with_obc<RWState::RWREAD>(soid,
       [FNAME, this, soid, need](auto head, auto obc) {
-      if (!obc->obs.exists) {
-        // XXX: this recovery must be triggered by backfills and the corresponding
-        //      object must have been deleted by some client request after the object
-        //      is enqueued for push but before the lock is acquired by the recovery.
-        //
-        //      Abort the recovery in this case, a "recover_delete" must have been
-        //      added for this object by the client request that deleted it.
-        return interruptor::now();
-      }
+           if (!obc->obs.exists) {
+             // XXX: this recovery must be triggered by backfills and the corresponding
+             //      object must have been deleted by some client request after the object
+             //      is enqueued for push but before the lock is acquired by the recovery.
+             //
+             //      Abort the recovery in this case. A MOSDPGRecoveryDelete must have been
+             //      sent, for this object to peers, by the client request that deleted it.
+             DEBUGDPP("obj={}, v={} not found on primary, aborting backfill", pg, soid, need);
+      
+             // if client delete request sent MOSDPGRecoveryDelete, we need to wait 
+             // for MOSDPGRecoveryDeleteReply from peers. 
+             auto& recovery_waiter = get_recovering(soid);
+             if (recovery_waiter.has_pushes()) {
+               DEBUGDPP("obj={}, v={} waiting for pushes", pg, soid, need);
+               return interruptor::make_interruptible(
+                 recovery_waiter.wait_for_all_pushes()
+               ).then_interruptible([this, soid] {
+                 object_stat_sum_t stat_diff;
+                 stat_diff.num_objects_recovered = 1;
+                 pg.get_recovery_handler()->on_global_recover(soid, stat_diff, true);
+               });
+             }
+             return interruptor::now();
+           }
       DEBUGDPP("loaded obc: {}", pg, obc->obs.oi.soid);
       auto& recovery_waiter = get_recovering(soid);
       recovery_waiter.obc = obc;
index 8a486570b85b925e391580007b53576c3831a1e2..2200d2810dee03b3b6bb5c71b47b87c92ac8b782 100644 (file)
@@ -166,6 +166,10 @@ class BackfillFixture : public crimson::osd::BackfillState::BackfillListener {
     const hobject_t& obj,
     const eversion_t& v) override;
 
+  void send_recovery_deletes(
+    const hobject_t& obj,
+    const std::vector<pg_shard_t>& peers) override;
+
   void maybe_flush() override;
 
   void update_peers_last_backfill(
@@ -399,6 +403,13 @@ void BackfillFixture::enqueue_drop(
   enqueued_drops[target].emplace_back(obj, v);
 }
 
+void BackfillFixture::send_recovery_deletes(
+  const hobject_t& obj,
+  const std::vector<pg_shard_t>& peers)
+{
+  // no-op in test mock
+}
+
 void BackfillFixture::maybe_flush()
 {
   for (const auto& [target, versioned_objs] : enqueued_drops) {