From 4b9def89f1488b2d38d44c59a8f714a3f3495e6c Mon Sep 17 00:00:00 2001 From: Xuehan Xu Date: Tue, 13 Aug 2024 15:32:02 +0800 Subject: [PATCH] crimson/osd/recovery_backend: restart object pulling for recoveries that are blocked pulling from down osds Fixes: https://tracker.ceph.com/issues/67508 Signed-off-by: Xuehan Xu --- src/crimson/osd/pg.h | 8 ++- src/crimson/osd/recovery_backend.h | 14 ++++++ .../osd/replicated_recovery_backend.cc | 49 ++++++++++--------- 3 files changed, 48 insertions(+), 23 deletions(-) diff --git a/src/crimson/osd/pg.h b/src/crimson/osd/pg.h index 58e3db938f679..d4d6d507110c8 100644 --- a/src/crimson/osd/pg.h +++ b/src/crimson/osd/pg.h @@ -357,7 +357,13 @@ public: shard_services.remove_want_pg_temp(orderer, pgid.pgid); } void check_recovery_sources(const OSDMapRef& newmap) final { - // Not needed yet + recovery_backend->for_each_recovery_waiter( + [newmap, FNAME](auto &, auto &waiter) { + if (waiter->is_pulling() && + newmap->is_down(waiter->pull_info->from.osd)) { + waiter->repeat_pull(); + } + }); } void check_blocklisted_watchers() final; void clear_primary_state() final { diff --git a/src/crimson/osd/recovery_backend.h b/src/crimson/osd/recovery_backend.h index b404b79751e7b..21154cb710679 100644 --- a/src/crimson/osd/recovery_backend.h +++ b/src/crimson/osd/recovery_backend.h @@ -112,6 +112,13 @@ public: } return on_stop(); } + + template <typename Func> + void for_each_recovery_waiter(Func &&f) { + for (auto &[soid, recovery_waiter] : recovering) { + std::forward<Func>(f)(soid, recovery_waiter); + } + } protected: crimson::osd::PG& pg; crimson::osd::ShardServices& shard_services; @@ -219,6 +226,13 @@ public: pulled.reset(); } } + void repeat_pull() { + ceph_assert(pulled); + pulled->set_exception(crimson::ct_error::eagain::exception_ptr()); + } + bool is_pulling() const { + return (bool)pulled; + } void set_push_failed(pg_shard_t shard, std::exception_ptr e) { auto it = pushes.find(shard); if (it != pushes.end()) { diff --git 
a/src/crimson/osd/replicated_recovery_backend.cc b/src/crimson/osd/replicated_recovery_backend.cc index f59d2f1757d0e..76f24196b51f7 100644 --- a/src/crimson/osd/replicated_recovery_backend.cc +++ b/src/crimson/osd/replicated_recovery_backend.cc @@ -113,28 +113,33 @@ ReplicatedRecoveryBackend::maybe_pull_missing_obj( // object is not missing, don't pull return seastar::make_ready_future<>(); } - return pg.obc_loader.with_obc(soid.get_head(), - [this, soid, need](auto head, auto) { - PullOp pull_op; - auto& recovery_waiter = get_recovering(soid); - recovery_waiter.pull_info = - std::make_optional(); - auto& pull_info = *recovery_waiter.pull_info; - prepare_pull(head, pull_op, pull_info, soid, need); - auto msg = crimson::make_message(); - msg->from = pg.get_pg_whoami(); - msg->set_priority(pg.get_recovery_op_priority()); - msg->pgid = pg.get_pgid(); - msg->map_epoch = pg.get_osdmap_epoch(); - msg->min_epoch = pg.get_last_peering_reset(); - msg->set_pulls({std::move(pull_op)}); - return shard_services.send_to_osd( - pull_info.from.osd, - std::move(msg), - pg.get_osdmap_epoch()); - }).si_then([this, soid] { - auto& recovery_waiter = get_recovering(soid); - return recovery_waiter.wait_for_pull(); + return interruptor::repeat_eagain([this, soid, need] { + using prepare_pull_iertr = + crimson::osd::ObjectContextLoader::load_obc_iertr::extend< + crimson::ct_error::eagain>; + return pg.obc_loader.with_obc(soid.get_head(), + [this, soid, need](auto head, auto) { + PullOp pull_op; + auto& recovery_waiter = get_recovering(soid); + recovery_waiter.pull_info = + std::make_optional(); + auto& pull_info = *recovery_waiter.pull_info; + prepare_pull(head, pull_op, pull_info, soid, need); + auto msg = crimson::make_message(); + msg->from = pg.get_pg_whoami(); + msg->set_priority(pg.get_recovery_op_priority()); + msg->pgid = pg.get_pgid(); + msg->map_epoch = pg.get_osdmap_epoch(); + msg->min_epoch = pg.get_last_peering_reset(); + msg->set_pulls({std::move(pull_op)}); + return 
shard_services.send_to_osd( + pull_info.from.osd, + std::move(msg), + pg.get_osdmap_epoch()); + }).si_then([this, soid]() -> prepare_pull_iertr::future<> { + auto& recovery_waiter = get_recovering(soid); + return recovery_waiter.wait_for_pull(); + }); }).handle_error_interruptible( crimson::ct_error::assert_all("unexpected error") ); -- 2.39.5