This commit fixes the abort in Recovered::Recovered.
There is a race to acquire the OBC lock between backfill and
client delete for the same object.
When the lock is acquired first by the backfill, the object is
recovered first, and then deleted by the client delete request.
When recovering the object, the corresponding peer_missing entry
is cleared and we are able to transition to Recovered state
successfully.
When the lock is acquired first by client delete request, the
object is deleted. Then backfill tries to recover the object,
finds it deleted and exits early. The stale peer_missing
entry is not cleared. In Recovered::Recovered, needs_recovery()
sees this stale peer_missing entry and calls abort.
The issue is fixed by sending MOSDPGRecoveryDelete from the client
path to peers and waiting for MOSDPGRecoveryDeleteReply in
recover_object.
Fixes: https://tracker.ceph.com/issues/70501
Signed-off-by: Shraddha Agrawal <shraddha.agrawal000@gmail.com>
const eversion_t &v,
const std::vector<pg_shard_t> &peers)
{
- progress_tracker->enqueue_drop(obj);
- for (auto bt : peers) {
- backfill_machine.backfill_listener.enqueue_drop(bt, obj, v);
- }
+ backfill_machine.backfill_listener.send_recovery_deletes(obj, peers);
}
std::ostream &operator<<(std::ostream &out, const BackfillState::PGFacade &pg) {
const hobject_t& obj,
const eversion_t& v) = 0;
+ virtual void send_recovery_deletes(
+ const hobject_t& obj,
+ const std::vector<pg_shard_t>& peers) = 0;
+
virtual void maybe_flush() = 0;
virtual void update_peers_last_backfill(
#include "crimson/osd/pg_backend.h"
#include "crimson/osd/pg_recovery.h"
+#include "messages/MOSDPGRecoveryDelete.h"
#include "osd/osd_types.h"
#include "osd/PeeringState.h"
req->ls.emplace_back(obj, v);
}
+void PGRecovery::send_recovery_deletes(
+ const hobject_t& obj,
+ const std::vector<pg_shard_t>& peers)
+{
+ LOG_PREFIX(PGRecovery::send_recovery_deletes);
+ DEBUGDPP("obj={}", *pg->get_dpp(), obj);
+ if (!pg->get_recovery_backend()->is_recovering(obj)) {
+ DEBUGDPP("obj={} is not recovering, exiting early", *pg->get_dpp(), obj);
+ return;
+ }
+ auto& peering_state = pg->get_peering_state();
+ epoch_t min_epoch = pg->get_last_peering_reset();
+ auto& recovering = pg->get_recovery_backend()->get_recovering(obj);
+ for (const auto& peer : peers) {
+ pg_missing_item item;
+ if (peering_state.get_peer_missing(peer).is_missing(obj, &item)) {
+ std::ignore = recovering.wait_for_pushes(peer);
+ spg_t target_pg(pg->get_pgid().pgid, peer.shard);
+ auto msg = crimson::make_message<MOSDPGRecoveryDelete>(
+ pg->get_pg_whoami(), target_pg, pg->get_osdmap_epoch(), min_epoch);
+ msg->objects.push_back(std::make_pair(obj, item.need));
+ std::ignore = pg->get_shard_services().send_to_osd(
+ peer.osd, std::move(msg), pg->get_osdmap_epoch());
+ }
+ }
+}
+
void PGRecovery::maybe_flush()
{
for (auto& [target, req] : backfill_drop_requests) {
const pg_shard_t& target,
const hobject_t& obj,
const eversion_t& v) final;
+ void send_recovery_deletes(
+ const hobject_t& obj,
+ const std::vector<pg_shard_t>& peers) final;
void maybe_flush() final;
void update_peers_last_backfill(
const hobject_t& new_last_backfill) final;
seastar::future<> wait_for_pushes(pg_shard_t shard) {
return pushes[shard].get_shared_future();
}
+ bool has_pushes() const {
+ return !pushes.empty();
+ }
+ seastar::future<> wait_for_all_pushes() {
+ return seastar::parallel_for_each(pushes,
+ [](auto& entry) {
+ return entry.second.get_shared_future();
+ });
+ }
seastar::future<> wait_for_recovered() {
if (!recovered) {
recovered = seastar::shared_promise<>();
DEBUGDPP("loading obc: {}", pg, soid);
return pg.obc_loader.with_obc<RWState::RWREAD>(soid,
[FNAME, this, soid, need](auto head, auto obc) {
- if (!obc->obs.exists) {
- // XXX: this recovery must be triggered by backfills and the corresponding
- // object must have been deleted by some client request after the object
- // is enqueued for push but before the lock is acquired by the recovery.
- //
- // Abort the recovery in this case, a "recover_delete" must have been
- // added for this object by the client request that deleted it.
- return interruptor::now();
- }
+ if (!obc->obs.exists) {
+ // XXX: this recovery must be triggered by backfills and the corresponding
+ // object must have been deleted by some client request after the object
+ // is enqueued for push but before the lock is acquired by the recovery.
+ //
+      // Abort the recovery in this case. A MOSDPGRecoveryDelete must have been
+      // sent for this object to peers by the client request that deleted it.
+ DEBUGDPP("obj={}, v={} not found on primary, aborting backfill", pg, soid, need);
+
+ // if client delete request sent MOSDPGRecoveryDelete, we need to wait
+ // for MOSDPGRecoveryDeleteReply from peers.
+ auto& recovery_waiter = get_recovering(soid);
+ if (recovery_waiter.has_pushes()) {
+ DEBUGDPP("obj={}, v={} waiting for pushes", pg, soid, need);
+ return interruptor::make_interruptible(
+ recovery_waiter.wait_for_all_pushes()
+ ).then_interruptible([this, soid] {
+ object_stat_sum_t stat_diff;
+ stat_diff.num_objects_recovered = 1;
+ pg.get_recovery_handler()->on_global_recover(soid, stat_diff, true);
+ });
+ }
+ return interruptor::now();
+ }
DEBUGDPP("loaded obc: {}", pg, obc->obs.oi.soid);
auto& recovery_waiter = get_recovering(soid);
recovery_waiter.obc = obc;
const hobject_t& obj,
const eversion_t& v) override;
+ void send_recovery_deletes(
+ const hobject_t& obj,
+ const std::vector<pg_shard_t>& peers) override;
+
void maybe_flush() override;
void update_peers_last_backfill(
enqueued_drops[target].emplace_back(obj, v);
}
+void BackfillFixture::send_recovery_deletes(
+ const hobject_t& obj,
+ const std::vector<pg_shard_t>& peers)
+{
+ // no-op in test mock
+}
+
void BackfillFixture::maybe_flush()
{
for (const auto& [target, versioned_objs] : enqueued_drops) {