From 5e1b33545a80a19ac074103315af56b508836372 Mon Sep 17 00:00:00 2001
From: Mohit Agrawal
Date: Thu, 16 Oct 2025 16:28:08 +0530
Subject: [PATCH] crimson: Move send_to_osd call outside with_obc lambda

The with_obc() function acquires a lock before invoking the lambda it
wraps. Previously the lambda itself called send_to_osd(), which returns
a future to with_obc(). If that future does not resolve immediately, a
pull response can arrive in the meantime and trigger
handle_pull_response(), which tries to acquire an exclusive lock on the
same object. Because the future has not yet resolved, with_obc() is
still holding the original lock, so handle_pull_response() hits an
assertion failure and the OSD crashes.

Solution: Move the send_to_osd() call outside the with_obc lambda so
that the lock is released before handle_pull_response() is triggered.

Fixes: https://tracker.ceph.com/issues/71861

Signed-off-by: Mohit Agrawal
---
 .../osd/replicated_recovery_backend.cc | 54 +++++++++++--------
 1 file changed, 31 insertions(+), 23 deletions(-)

diff --git a/src/crimson/osd/replicated_recovery_backend.cc b/src/crimson/osd/replicated_recovery_backend.cc
index 226b66f1db2..72922578d19 100644
--- a/src/crimson/osd/replicated_recovery_backend.cc
+++ b/src/crimson/osd/replicated_recovery_backend.cc
@@ -132,29 +132,37 @@ ReplicatedRecoveryBackend::maybe_pull_missing_obj(
   using prepare_pull_iertr =
     crimson::osd::ObjectContextLoader::load_obc_iertr::extend<
       crimson::ct_error::eagain>;
-  return pg.obc_loader.with_obc(soid.get_head(),
-    [this, soid, need](auto head, auto) {
-    PullOp pull_op;
-    auto& recovery_waiter = get_recovering(soid);
-    recovery_waiter.pull_info =
-      std::make_optional<RecoveryBackend::pull_info_t>();
-    auto& pull_info = *recovery_waiter.pull_info;
-    prepare_pull(head, pull_op, pull_info, soid, need);
-    auto msg = crimson::make_message<MOSDPGPull>();
-    msg->from = pg.get_pg_whoami();
-    msg->set_priority(pg.get_recovery_op_priority());
-    msg->pgid = pg.get_pgid();
-    msg->map_epoch = pg.get_osdmap_epoch();
-    msg->min_epoch = pg.get_last_peering_reset();
-    msg->set_pulls({std::move(pull_op)});
-    return shard_services.send_to_osd(
-      pull_info.from.osd,
-      std::move(msg),
-      pg.get_osdmap_epoch());
-  }).si_then([this, soid]() -> prepare_pull_iertr::future<> {
-    auto& recovery_waiter = get_recovering(soid);
-    return recovery_waiter.wait_for_pull();
-  });
+
+  return seastar::do_with(
+    PullOp{},
+    [this, soid, need](PullOp &pull_op) {
+    return pg.obc_loader.with_obc(soid.get_head(),
+      [this, soid, need, &pull_op](auto head, auto) {
+      auto& recovery_waiter = get_recovering(soid);
+      recovery_waiter.pull_info.emplace();
+      auto &pull_info = *recovery_waiter.pull_info;
+      prepare_pull(head, pull_op, pull_info, soid, need);
+      return seastar::now();
+    }).si_then([this, soid, &pull_op]() {
+      auto& recovery_waiter = get_recovering(soid);
+      auto &pull_info = *recovery_waiter.pull_info;
+      auto msg = crimson::make_message<MOSDPGPull>();
+      msg->from = pg.get_pg_whoami();
+      msg->set_priority(pg.get_recovery_op_priority());
+      msg->pgid = pg.get_pgid();
+      msg->map_epoch = pg.get_osdmap_epoch();
+      msg->min_epoch = pg.get_last_peering_reset();
+      msg->set_pulls({std::move(pull_op)});
+      return shard_services.send_to_osd(
+        pull_info.from.osd,
+        std::move(msg),
+        pg.get_osdmap_epoch()
+      ).then([this, soid]() -> prepare_pull_iertr::future<> {
+        auto& recovery_waiter = get_recovering(soid);
+        return recovery_waiter.wait_for_pull();
+      });
+    });
+  });
   }).handle_error_interruptible(
     crimson::ct_error::assert_all(fmt::format(
       "{} {} error with {} need {} ", pg, FNAME, soid, need).c_str())
-- 
2.39.5
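
For illustration only, below is a minimal standalone C++ sketch of the ordering problem the
commit message describes. It uses plain std::mutex/std::async instead of crimson/Seastar
primitives, and every name in it (obc_lock, handle_pull_response_stub, pull_while_holding_lock,
pull_after_releasing_lock) is a hypothetical stand-in, not the real crimson API.

// Standalone sketch (not crimson code) of the hazard described above:
// if the lock taken by with_obc() is still held while the send's future
// is pending, the reply path that needs the same lock cannot run.
#include <future>
#include <iostream>
#include <mutex>
#include <string>

std::mutex obc_lock;  // stands in for the object-context lock held by with_obc()

// Stand-in for handle_pull_response(): the reply path needs the same lock.
std::string handle_pull_response_stub() {
  std::lock_guard<std::mutex> g(obc_lock);  // asserts/deadlocks if still held
  return "pulled object data";
}

// Old shape: the lock is held across the send and the wait for the reply,
// so the reply handler can never acquire it. Never called below.
void pull_while_holding_lock() {
  std::lock_guard<std::mutex> g(obc_lock);
  auto reply = std::async(std::launch::async, handle_pull_response_stub);
  reply.get();  // deadlock: the reply thread is blocked on obc_lock
}

// New shape (what the patch does): prepare the request under the lock,
// release it, then send and wait for the reply outside the lock.
void pull_after_releasing_lock() {
  std::string pull_op;
  {
    std::lock_guard<std::mutex> g(obc_lock);  // with_obc-style critical section
    pull_op = "PullOp prepared";              // prepare_pull() equivalent
  }                                           // lock released here
  auto reply = std::async(std::launch::async, handle_pull_response_stub);
  std::cout << pull_op << " -> " << reply.get() << "\n";
}

int main() {
  pull_after_releasing_lock();  // completes; the old shape would hang forever
}

The fixed shape mirrors the patch: only the preparation of the pull request happens under the
lock, and the cross-OSD round trip starts after the lock scope has ended, so the response
handler can take the lock it needs.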