git-server-git.apps.pok.os.sepia.ceph.com Git - ceph.git/commitdiff
crimson/osd/osd_operations/client_request: hang client requests when the
author Xuehan Xu <xuxuehan@qianxin.com>
Thu, 20 Jun 2024 05:26:53 +0000 (13:26 +0800)
committer Matan Breizman <mbreizma@redhat.com>
Wed, 24 Jul 2024 08:32:05 +0000 (08:32 +0000)
object is missing in the whole cluster

Fixes: https://tracker.ceph.com/issues/65696
Signed-off-by: Xuehan Xu <xuxuehan@qianxin.com>
src/crimson/osd/osd_operations/client_request.cc
src/crimson/osd/osd_operations/client_request_common.cc
src/crimson/osd/osd_operations/client_request_common.h
src/crimson/osd/osd_operations/internal_client_request.cc
src/crimson/osd/pg_recovery.cc
src/crimson/osd/recovery_backend.h

index 33d0f4c0f41282719bfc92316b8a465e1bbce270..37efecb11b81e82e99c1fb9fd828149daa551fd2 100644 (file)
@@ -286,6 +286,7 @@ ClientRequest::recover_missing_snaps(
   ObjectContextRef head,
   std::set<snapid_t> &snaps)
 {
+  LOG_PREFIX(ClientRequest::process_op);
   co_await ihref.enter_stage<interruptor>(
     client_pp(*pg).recover_missing_snaps, *this);
   for (auto &snap : snaps) {
@@ -299,7 +300,12 @@ ClientRequest::recover_missing_snaps(
      * we skip the oid as there is no corresponding clone to recover.
      * See https://tracker.ceph.com/issues/63821 */
     if (oid) {
-      co_await do_recover_missing(pg, *oid, m->get_reqid());
+      auto unfound = co_await do_recover_missing(pg, *oid, m->get_reqid());
+      if (unfound) {
+        DEBUGDPP("{} unfound, hang it for now", *pg, m->get_hobj().get_head());
+        co_await interruptor::make_interruptible(
+          pg->get_recovery_backend()->add_unfound(m->get_hobj().get_head()));
+      }
     }
   }
 }
@@ -317,7 +323,14 @@ ClientRequest::process_op(
       "Skipping recover_missings on non primary pg for soid {}",
       *pg, m->get_hobj());
   } else {
-    co_await do_recover_missing(pg, m->get_hobj().get_head(), m->get_reqid());
+    auto unfound = co_await do_recover_missing(
+      pg, m->get_hobj().get_head(), m->get_reqid());
+    if (unfound) {
+      DEBUGDPP("{} unfound, hang it for now", *pg, m->get_hobj().get_head());
+      co_await interruptor::make_interruptible(
+        pg->get_recovery_backend()->add_unfound(m->get_hobj().get_head()));
+    }
+
     std::set<snapid_t> snaps = snaps_need_to_recover();
     if (!snaps.empty()) {
       // call with_obc() in order, but wait concurrently for loading.
index c4439d5bb35d069f3036f296bfbe6063adaca42a..a56d58d2066c8e4e5c3770bf04337879f015fe74 100644 (file)
@@ -13,7 +13,7 @@ namespace {
 
 namespace crimson::osd {
 
-typename InterruptibleOperation::template interruptible_future<>
+typename InterruptibleOperation::template interruptible_future<bool>
 CommonClientRequest::do_recover_missing(
   Ref<PG> pg,
   const hobject_t& soid,
@@ -45,22 +45,29 @@ CommonClientRequest::do_recover_missing(
   if (!needs_recovery_or_backfill) {
     logger().debug("{} reqid {} nothing to recover {}",
                    __func__, reqid, soid);
-    return seastar::now();
+    return seastar::make_ready_future<bool>(false);
   }
 
+  if (pg->get_peering_state().get_missing_loc().is_unfound(soid)) {
+    return seastar::make_ready_future<bool>(true);
+  }
   logger().debug("{} reqid {} need to wait for recovery, {} version {}",
                  __func__, reqid, soid, ver);
   if (pg->get_recovery_backend()->is_recovering(soid)) {
     logger().debug("{} reqid {} object {} version {}, already recovering",
                    __func__, reqid, soid, ver);
-    return pg->get_recovery_backend()->get_recovering(soid).wait_for_recovered();
+    return pg->get_recovery_backend()->get_recovering(
+      soid).wait_for_recovered(
+    ).then([] {
+      return seastar::make_ready_future<bool>(false);
+    });
   } else {
     logger().debug("{} reqid {} object {} version {}, starting recovery",
                    __func__, reqid, soid, ver);
     auto [op, fut] =
       pg->get_shard_services().start_operation<UrgentRecovery>(
         soid, ver, pg, pg->get_shard_services(), pg->get_osdmap_epoch());
-    return std::move(fut);
+    return fut.then([] { return seastar::make_ready_future<bool>(false); });
   }
 }
 
index 85f118d64c16fd527e7f59eddcaba7523dd6ef5b..951bf653799e7247044f5300184ccda148653fbd 100644 (file)
@@ -11,7 +11,7 @@ namespace crimson::osd {
 
 struct CommonClientRequest {
 
-  static InterruptibleOperation::template interruptible_future<>
+  static InterruptibleOperation::template interruptible_future<bool>
   do_recover_missing(
     Ref<PG> pg,
     const hobject_t& soid,
index 22d7f3e492ad4865a72b353b76ce30e7c171e01b..2968a6f4385987ff0fd6e3cd88c06df10aea61d0 100644 (file)
@@ -70,7 +70,12 @@ seastar::future<> InternalClientRequest::start()
             client_pp().recover_missing);
         }).then_interruptible([this] {
           return do_recover_missing(pg, get_target_oid(), osd_reqid_t());
-        }).then_interruptible([this] {
+        }).then_interruptible([this](bool unfound) {
+          if (unfound) {
+            throw std::system_error(
+              std::make_error_code(std::errc::operation_canceled),
+              fmt::format("{} is unfound, drop it!", get_target_oid()));
+          }
           return enter_stage<interruptor>(
             client_pp().get_obc);
         }).then_interruptible([this] () -> PG::load_obc_iertr::future<> {
@@ -128,6 +133,9 @@ seastar::future<> InternalClientRequest::start()
       }, pg, start_epoch);
     }).then([this] {
       track_event<CompletionEvent>();
+    }).handle_exception_type([](std::system_error &error) {
+      logger().debug("error {}, message: {}", error.code(), error.what());
+      return seastar::now();
     }).finally([this] {
       logger().debug("{}: exit", *this);
       handle.exit();
index 05f8c6e1f968af0f5a6e15dcb648b4bcb402a943..4ec68729607439cb3f8963d7ce1898b251274268 100644 (file)
@@ -431,6 +431,7 @@ void PGRecovery::on_global_recover (
   auto& recovery_waiter = pg->get_recovery_backend()->get_recovering(soid);
   recovery_waiter.set_recovered();
   pg->get_recovery_backend()->remove_recovering(soid);
+  pg->get_recovery_backend()->found_and_remove(soid);
 }
 
 void PGRecovery::on_failed_recover(
index f5a365c155883e2b55770c090673834af41fd960..1225a920a1dc2edcf3ed6c580fd87cbf54cbe5c8 100644 (file)
@@ -50,6 +50,18 @@ public:
     assert(it->second);
     return {*(it->second), added};
   }
+  seastar::future<> add_unfound(const hobject_t &soid) {
+    auto [it, added] = unfound.emplace(soid, seastar::shared_promise());
+    return it->second.get_shared_future();
+  }
+  void found_and_remove(const hobject_t &soid) {
+    auto it = unfound.find(soid);
+    if (it != unfound.end()) {
+      auto &found_promise = it->second;
+      found_promise.set_value();
+      unfound.erase(it);
+    }
+  }
   WaitForObjectRecovery& get_recovering(const hobject_t& soid) {
     assert(is_recovering(soid));
     return *(recovering.at(soid));
@@ -91,6 +103,10 @@ public:
     for (auto& [soid, recovery_waiter] : recovering) {
       recovery_waiter->stop();
     }
+    for (auto& [soid, promise] : unfound) {
+      promise.set_exception(
+       crimson::common::system_shutdown_exception());
+    }
     return on_stop();
   }
 protected:
@@ -236,6 +252,7 @@ public:
   using WaitForObjectRecoveryRef = boost::intrusive_ptr<WaitForObjectRecovery>;
 protected:
   std::map<hobject_t, WaitForObjectRecoveryRef> recovering;
+  std::map<hobject_t, seastar::shared_promise<>> unfound;
   hobject_t get_temp_recovery_object(
     const hobject_t& target,
     eversion_t version) const;