git-server-git.apps.pok.os.sepia.ceph.com Git - ceph.git/commitdiff
crimson/osd: remote peering requests wait for OSD activation.
author Radoslaw Zarzynski <rzarzyns@redhat.com>
Tue, 13 Jul 2021 12:09:39 +0000 (12:09 +0000)
committer Radoslaw Zarzynski <rzarzyns@redhat.com>
Tue, 13 Jul 2021 15:02:16 +0000 (15:02 +0000)
Before this patch, `RemotePeeringEvent` instances did not wait
for OSD activation. This bypassed the protection against
handling old, outdated peering events that the `MOSDBoot`
machinery offers. The net result is crashes like this one
(the `OSDState is booting` line was produced by a custom debug message):

```
2021-07-07T18:20:23.293 INFO:journalctl@ceph.osd.2.smithi145.stdout:Jul 07 18:16:30 smithi145 conmon[71083]: DEBUG 2021-07-07 18:16:30,535 [shard 0] ms - [osd.2(cluster) v2:172.21.15.145:6802/2@62336 >> osd
.1 v2:172.21.15.145:6809/2] <== #19 === pg_lease(4.9 pg_lease(ru 60.120281219s ub 68.121276855s int 16.000000000s) e86/86) v1 (133)
2021-07-07T18:20:23.293 INFO:journalctl@ceph.osd.2.smithi145.stdout:Jul 07 18:16:30 smithi145 conmon[71083]: DEBUG 2021-07-07 18:16:30,536 [shard 0] osd - handle_peering_op on 4.9 from 1
2021-07-07T18:20:23.293 INFO:journalctl@ceph.osd.2.smithi145.stdout:Jul 07 18:16:30 smithi145 conmon[71083]: DEBUG 2021-07-07 18:16:30,536 [shard 0] osd - peering_event(id=125, detail=PeeringEvent(from=1 pg
id=4.9 sent=86 requested=86 evt=epoch_sent: 86 epoch_requested: 86 MLease epoch 86 from osd.1 pg_lease(ru 60.120281219s ub 68.121276855s int 16.000000000s))): start
2021-07-07T18:20:23.293 INFO:journalctl@ceph.osd.2.smithi145.stdout:Jul 07 18:16:30 smithi145 conmon[71083]: DEBUG 2021-07-07 18:16:30,536 [shard 0] osd - peering_event(id=125, detail=PeeringEvent(from=1 pg
id=4.9 sent=86 requested=86 evt=epoch_sent: 86 epoch_requested: 86 MLease epoch 86 from osd.1 pg_lease(ru 60.120281219s ub 68.121276855s int 16.000000000s))): got map 93
2021-07-07T18:20:23.294 INFO:journalctl@ceph.osd.2.smithi145.stdout:Jul 07 18:16:30 smithi145 conmon[71083]: DEBUG 2021-07-07 18:16:30,536 [shard 0] osd - peering_event(id=125, detail=PeeringEvent(from=1 pgid=4.9 sent=86 requested=86 evt=epoch_sent: 86 epoch_requested: 86 MLease epoch 86 from osd.1 pg_lease(ru 60.120281219s ub 68.121276855s int 16.000000000s))): OSDState is booting
2021-07-07T18:20:23.294 INFO:journalctl@ceph.osd.2.smithi145.stdout:Jul 07 18:16:30 smithi145 conmon[71083]: ERROR 2021-07-07 18:16:30,536 [shard 0] none - /home/jenkins-build/build/workspace/ceph-dev-new-build/ARCH/x86_64/AVAILABLE_ARCH/x86_64/AVAILABLE_DIST/centos8/DIST/centos8/MACHINE_SIZE/gigantic/release/17.0.0-5007-g3a9abb02/rpm/el8/BUILD/ceph-17.0.0-5007-g3a9abb02/src/crimson/osd/osd_operations/peering_event.cc:165 : In function 'crimson::osd::RemotePeeringEvent::get_pg()::<lambda()>', ceph_assert(%s)
2021-07-07T18:20:23.294 INFO:journalctl@ceph.osd.2.smithi145.stdout:Jul 07 18:16:30 smithi145 conmon[71083]: osd.state.is_active()
2021-07-07T18:20:23.294 INFO:journalctl@ceph.osd.2.smithi145.stdout:Jul 07 18:16:30 smithi145 conmon[71083]: Aborting on shard 0.
```

Signed-off-by: Radoslaw Zarzynski <rzarzyns@redhat.com>
src/crimson/osd/osd.cc
src/crimson/osd/osd.h
src/crimson/osd/osd_operations/peering_event.cc
src/crimson/osd/osd_operations/peering_event.h

index 17924a9428332550e8eaa2fd878e2fe36618d114..297b3f65e47e92f618037e36216c5253383ab554 100644 (file)
@@ -94,7 +94,8 @@ OSD::OSD(int id, uint32_t nonce,
       update_stats();
     }},
     asok{seastar::make_lw_shared<crimson::admin::AdminSocket>()},
-    osdmap_gate("OSD::osdmap_gate", std::make_optional(std::ref(shard_services)))
+    osdmap_gate("OSD::osdmap_gate", std::make_optional(std::ref(shard_services))),
+    wait_for_active(std::in_place_t{})
 {
   osdmaps[0] = boost::make_local_shared<OSDMap>();
   for (auto msgr : {std::ref(cluster_msgr), std::ref(public_msgr),
@@ -1067,6 +1068,9 @@ seastar::future<> OSD::committed_osd_maps(version_t first,
       if (state.is_booting()) {
         logger().info("osd.{}: activating...", whoami);
         state.set_active();
+        assert(wait_for_active);
+        wait_for_active->set_value();
+        wait_for_active = std::nullopt;
         beacon_timer.arm_periodic(
           std::chrono::seconds(local_conf()->osd_beacon_report_interval));
         tick_timer.arm_periodic(
index 6addb3b91cc07e910485377a39469cfedd551e3f..ee4f8cd23073e463eea25da80241802954a81960 100644 (file)
@@ -230,6 +230,10 @@ private:
   void update_heartbeat_peers();
   friend class PGAdvanceMap;
 
+  RemotePeeringEvent::OSDPipeline peering_request_osd_pipeline;
+  std::optional<seastar::shared_promise<>> wait_for_active;
+  friend class RemotePeeringEvent;
+
 public:
   blocking_future<Ref<PG>> get_or_create_pg(
     spg_t pgid,
index 02df7eff586a138986e829ab2992474921e9ceb0..7e3f46808d4c13c1701956b2ea38e26c014064e3 100644 (file)
@@ -114,6 +114,11 @@ RemotePeeringEvent::ConnectionPipeline &RemotePeeringEvent::cp()
   return get_osd_priv(conn.get()).peering_request_conn_pipeline;
 }
 
+RemotePeeringEvent::OSDPipeline &RemotePeeringEvent::op()
+{
+  return osd.peering_request_osd_pipeline;
+}
+
 void RemotePeeringEvent::on_pg_absent()
 {
   if (auto& e = get_event().get_event();
@@ -149,8 +154,16 @@ seastar::future<> RemotePeeringEvent::complete_rctx(Ref<PG> pg)
 seastar::future<Ref<PG>> RemotePeeringEvent::get_pg()
 {
   return with_blocking_future(
-    handle.enter(cp().await_map)
+    handle.enter(op().await_active)
   ).then([this] {
+    if (osd.wait_for_active) {
+      return osd.wait_for_active->get_shared_future();
+    } else {
+      return seastar::now();
+    }
+  }).then([this] {
+    return with_blocking_future(handle.enter(cp().await_map));
+  }).then([this] {
     return with_blocking_future(
       osd.osdmap_gate.wait_for_map(evt.get_epoch_sent()));
   }).then([this](auto epoch) {
index 29462c8202b83947d234e8fccd5f27ec05aa4500..9d9478fa5359029c463130e80497f0689bde35f3 100644 (file)
@@ -99,6 +99,12 @@ protected:
   seastar::future<Ref<PG>> get_pg() final;
 
 public:
+  class OSDPipeline {
+    OrderedExclusivePhase await_active = {
+      "PeeringRequest::OSDPipeline::await_active"
+    };
+    friend class RemotePeeringEvent;
+  };
   class ConnectionPipeline {
     OrderedExclusivePhase await_map = {
       "PeeringRequest::ConnectionPipeline::await_map"
@@ -118,6 +124,7 @@ public:
 
 private:
   ConnectionPipeline &cp();
+  OSDPipeline &op();
 };
 
 class LocalPeeringEvent final : public PeeringEvent {