From 1f1051d91f320636f513faeb2d113542bcb9e5c3 Mon Sep 17 00:00:00 2001 From: Xuehan Xu Date: Sun, 29 Sep 2024 17:26:04 +0800 Subject: [PATCH] crimson/osd/pg_shard_manager: discard outdated operations when the corresponding pgs are already removed Fixes: https://tracker.ceph.com/issues/68286 Signed-off-by: Xuehan Xu --- src/crimson/osd/osd_operation.h | 3 ++ .../osd/osd_operations/client_request.h | 4 ++ .../osd/osd_operations/logmissing_request.h | 3 ++ .../osd_operations/logmissing_request_reply.h | 3 ++ .../osd/osd_operations/peering_event.h | 8 ++++ .../osd/osd_operations/pg_advance_map.h | 4 ++ .../osd/osd_operations/recovery_subrequest.h | 3 ++ .../osd/osd_operations/replicated_request.h | 3 ++ src/crimson/osd/osd_operations/scrub_events.h | 12 ++++- src/crimson/osd/pg_shard_manager.h | 46 ++++++++++++++----- src/crimson/osd/shard_services.cc | 5 ++ src/crimson/osd/shard_services.h | 2 + 12 files changed, 82 insertions(+), 14 deletions(-) diff --git a/src/crimson/osd/osd_operation.h b/src/crimson/osd/osd_operation.h index 2897a7e16237a..8f525c6a8a423 100644 --- a/src/crimson/osd/osd_operation.h +++ b/src/crimson/osd/osd_operation.h @@ -211,6 +211,9 @@ protected: public: static constexpr bool is_trackable = true; + virtual bool requires_pg() const { + return true; + } }; template diff --git a/src/crimson/osd/osd_operations/client_request.h b/src/crimson/osd/osd_operations/client_request.h index 98443bdfc0f73..91a6728fd4bcf 100644 --- a/src/crimson/osd/osd_operations/client_request.h +++ b/src/crimson/osd/osd_operations/client_request.h @@ -42,6 +42,10 @@ class ClientRequest final : public PhasedOperationT, unsigned instance_id = 0; public: + epoch_t get_epoch_sent_at() const { + return m->get_map_epoch(); + } + /** * instance_handle_t * diff --git a/src/crimson/osd/osd_operations/logmissing_request.h b/src/crimson/osd/osd_operations/logmissing_request.h index e12243ce430fd..fe4761c4ab482 100644 --- a/src/crimson/osd/osd_operations/logmissing_request.h +++ b/src/crimson/osd/osd_operations/logmissing_request.h @@ -36,6 +36,9 @@ public: } PipelineHandle &get_handle() { return handle; } epoch_t get_epoch() const { return req->get_min_epoch(); } + epoch_t get_epoch_sent_at() const { + return req->get_map_epoch(); + } ConnectionPipeline &get_connection_pipeline(); diff --git a/src/crimson/osd/osd_operations/logmissing_request_reply.h b/src/crimson/osd/osd_operations/logmissing_request_reply.h index 71651d16789bc..bdb6c2ac6acdd 100644 --- a/src/crimson/osd/osd_operations/logmissing_request_reply.h +++ b/src/crimson/osd/osd_operations/logmissing_request_reply.h @@ -36,6 +36,9 @@ public: } PipelineHandle &get_handle() { return handle; } epoch_t get_epoch() const { return req->get_min_epoch(); } + epoch_t get_epoch_sent_at() const { + return req->get_map_epoch(); + } ConnectionPipeline &get_connection_pipeline(); diff --git a/src/crimson/osd/osd_operations/peering_event.h b/src/crimson/osd/osd_operations/peering_event.h index 85de5c711d67c..aa6b8a95a94ae 100644 --- a/src/crimson/osd/osd_operations/peering_event.h +++ b/src/crimson/osd/osd_operations/peering_event.h @@ -44,6 +44,10 @@ protected: float delay = 0; PGPeeringEvent evt; + epoch_t get_epoch_sent_at() const { + return evt.get_epoch_sent(); + } + const pg_shard_t get_from() const { return from; } @@ -84,6 +88,10 @@ public: evt(std::forward(args)...) {} + bool requires_pg() const final { + return evt.requires_pg; + } + void print(std::ostream &) const final; void dump_detail(ceph::Formatter* f) const final; seastar::future<> with_pg( diff --git a/src/crimson/osd/osd_operations/pg_advance_map.h b/src/crimson/osd/osd_operations/pg_advance_map.h index 43be7319545b1..21702f6ff4f76 100644 --- a/src/crimson/osd/osd_operations/pg_advance_map.h +++ b/src/crimson/osd/osd_operations/pg_advance_map.h @@ -50,6 +50,10 @@ public: PGPeeringPipeline::Process::BlockingEvent > tracking_events; + epoch_t get_epoch_sent_at() const { + return to; + } + private: PGPeeringPipeline &peering_pp(PG &pg); }; diff --git a/src/crimson/osd/osd_operations/recovery_subrequest.h b/src/crimson/osd/osd_operations/recovery_subrequest.h index 17c2faf97ea98..2fe8ff372b3f9 100644 --- a/src/crimson/osd/osd_operations/recovery_subrequest.h +++ b/src/crimson/osd/osd_operations/recovery_subrequest.h @@ -39,6 +39,9 @@ public: } PipelineHandle &get_handle() { return handle; } epoch_t get_epoch() const { return m->get_min_epoch(); } + epoch_t get_epoch_sent_at() const { + return m->get_map_epoch(); + } ConnectionPipeline &get_connection_pipeline(); diff --git a/src/crimson/osd/osd_operations/replicated_request.h b/src/crimson/osd/osd_operations/replicated_request.h index 1e84fd108e23e..05724943cf040 100644 --- a/src/crimson/osd/osd_operations/replicated_request.h +++ b/src/crimson/osd/osd_operations/replicated_request.h @@ -36,6 +36,9 @@ public: } PipelineHandle &get_handle() { return handle; } epoch_t get_epoch() const { return req->get_min_epoch(); } + epoch_t get_epoch_sent_at() const { + return req->get_map_epoch(); + } ConnectionPipeline &get_connection_pipeline(); diff --git a/src/crimson/osd/osd_operations/scrub_events.h b/src/crimson/osd/osd_operations/scrub_events.h index 02a5d852bb7c2..8bed90e4c14fb 100644 --- a/src/crimson/osd/osd_operations/scrub_events.h +++ b/src/crimson/osd/osd_operations/scrub_events.h @@ -27,11 +27,11 @@ class RemoteScrubEventBaseT : public PhasedOperationT { crimson::net::ConnectionRef l_conn; crimson::net::ConnectionXcoreRef r_conn; - epoch_t epoch; spg_t pgid; protected: using interruptor = InterruptibleOperation::interruptor; + epoch_t epoch; template using ifut = InterruptibleOperation::interruptible_future; @@ -40,7 +40,7 @@ protected: public: RemoteScrubEventBaseT( crimson::net::ConnectionRef conn, epoch_t epoch, spg_t pgid) - : l_conn(std::move(conn)), epoch(epoch), pgid(pgid) {} + : l_conn(std::move(conn)), pgid(pgid), epoch(epoch) {} PGPeeringPipeline &get_peering_pipeline(PG &pg); @@ -117,6 +117,10 @@ public: : RemoteScrubEventBaseT(std::forward(base_args)...), deep(deep) {} + epoch_t get_epoch_sent_at() const { + return epoch; + } + void print(std::ostream &out) const final { out << "(deep=" << deep << ")"; } @@ -141,6 +145,10 @@ public: ceph_assert(scrub::PGScrubber::is_scrub_message(*m)); } + epoch_t get_epoch_sent_at() const { + return epoch; + } + void print(std::ostream &out) const final { out << "(m=" << *m << ")"; } diff --git a/src/crimson/osd/pg_shard_manager.h b/src/crimson/osd/pg_shard_manager.h index b9879c8c9ddde..f7bd7a6c08e30 100644 --- a/src/crimson/osd/pg_shard_manager.h +++ b/src/crimson/osd/pg_shard_manager.h @@ -256,18 +256,40 @@ public: auto &opref = *op; return opref.template with_blocking_event< PGMap::PGCreationBlockingEvent - >([&target_shard_services, &opref](auto &&trigger) { - return target_shard_services.wait_for_pg( - std::move(trigger), opref.get_pgid()); - }).safe_then([&logger, &target_shard_services, &opref](Ref pgref) { - logger.debug("{}: have_pg", opref); - return opref.with_pg(target_shard_services, pgref); - }).handle_error( - crimson::ct_error::ecanceled::handle([&logger, &opref](auto) { - logger.debug("{}: pg creation canceled, dropping", opref); - return seastar::now(); - }) - ).then([op=std::move(op)] {}); + >([&target_shard_services, &opref, &logger](auto &&trigger) mutable { + auto pg = target_shard_services.get_pg(opref.get_pgid()); + auto fut = ShardServices::wait_for_pg_ertr::make_ready_future>(pg); + if (!pg) { + if (opref.requires_pg()) { + auto osdmap = target_shard_services.get_map(); + if (!osdmap->is_up_acting_osd_shard( + opref.get_pgid(), target_shard_services.local_state.whoami)) { + logger.debug( + "pg {} for {} is no longer here, discarding", + opref.get_pgid(), opref); + opref.get_handle().exit(); + auto _fut = seastar::now(); + if (osdmap->get_epoch() > opref.get_epoch_sent_at()) { + _fut = target_shard_services.send_incremental_map( + std::ref(opref.get_foreign_connection()), + opref.get_epoch_sent_at() + 1); + } + return _fut; + } + } + fut = target_shard_services.wait_for_pg( + std::move(trigger), opref.get_pgid()); + } + return fut.safe_then([&logger, &target_shard_services, &opref](Ref pgref) { + logger.debug("{}: have_pg", opref); + return opref.with_pg(target_shard_services, pgref); + }).handle_error( + crimson::ct_error::ecanceled::handle([&logger, &opref](auto) { + logger.debug("{}: pg creation canceled, dropping", opref); + return seastar::now(); + }) + ); + }).then([op=std::move(op)] {}); } seastar::future<> load_pgs(crimson::os::FuturizedStore& store); diff --git a/src/crimson/osd/shard_services.cc b/src/crimson/osd/shard_services.cc index c23408989293d..e1acb34636f2d 100644 --- a/src/crimson/osd/shard_services.cc +++ b/src/crimson/osd/shard_services.cc @@ -783,6 +783,11 @@ seastar::future<> ShardServices::dispatch_context_transaction( co_return; } +Ref ShardServices::get_pg(spg_t pgid) +{ + return local_state.get_pg(pgid); +} + seastar::future<> ShardServices::dispatch_context_messages( BufferedRecoveryMessages &&ctx) { diff --git a/src/crimson/osd/shard_services.h b/src/crimson/osd/shard_services.h index 56ac4963fff71..f4d4b4c2eb4f5 100644 --- a/src/crimson/osd/shard_services.h +++ b/src/crimson/osd/shard_services.h @@ -483,6 +483,8 @@ public: return pg_to_shard_mapping.remove_pg_mapping(pgid); } + Ref get_pg(spg_t pgid); + crimson::common::CephContext *get_cct() { return &(local_state.cct); } -- 2.39.5