Before this patch, `RemotePeeringEvent` instances did not
wait for OSD activation. This let them bypass the protection
against handling old, outdated peering events that the `MOSDBoot`
machinery offers. The net result is crashes like this one
(the `OSDState is booting` line was produced by a custom debug message):
```
2021-07-07T18:20:23.293 INFO:journalctl@ceph.osd.2.smithi145.stdout:Jul 07 18:16:30 smithi145 conmon[71083]: DEBUG 2021-07-07 18:16:30,535 [shard 0] ms - [osd.2(cluster) v2:172.21.15.145:6802/2@62336 >> osd
.1 v2:172.21.15.145:6809/2] <== #19 === pg_lease(4.9 pg_lease(ru 60.120281219s ub 68.121276855s int 16.000000000s) e86/86) v1 (133)
2021-07-07T18:20:23.293 INFO:journalctl@ceph.osd.2.smithi145.stdout:Jul 07 18:16:30 smithi145 conmon[71083]: DEBUG 2021-07-07 18:16:30,536 [shard 0] osd - handle_peering_op on 4.9 from 1
2021-07-07T18:20:23.293 INFO:journalctl@ceph.osd.2.smithi145.stdout:Jul 07 18:16:30 smithi145 conmon[71083]: DEBUG 2021-07-07 18:16:30,536 [shard 0] osd - peering_event(id=125, detail=PeeringEvent(from=1 pg
id=4.9 sent=86 requested=86 evt=epoch_sent: 86 epoch_requested: 86 MLease epoch 86 from osd.1 pg_lease(ru 60.120281219s ub 68.121276855s int 16.000000000s))): start
2021-07-07T18:20:23.293 INFO:journalctl@ceph.osd.2.smithi145.stdout:Jul 07 18:16:30 smithi145 conmon[71083]: DEBUG 2021-07-07 18:16:30,536 [shard 0] osd - peering_event(id=125, detail=PeeringEvent(from=1 pg
id=4.9 sent=86 requested=86 evt=epoch_sent: 86 epoch_requested: 86 MLease epoch 86 from osd.1 pg_lease(ru 60.120281219s ub 68.121276855s int 16.000000000s))): got map 93
2021-07-07T18:20:23.294 INFO:journalctl@ceph.osd.2.smithi145.stdout:Jul 07 18:16:30 smithi145 conmon[71083]: DEBUG 2021-07-07 18:16:30,536 [shard 0] osd - peering_event(id=125, detail=PeeringEvent(from=1 pgid=4.9 sent=86 requested=86 evt=epoch_sent: 86 epoch_requested: 86 MLease epoch 86 from osd.1 pg_lease(ru 60.120281219s ub 68.121276855s int 16.000000000s))): OSDState is booting
2021-07-07T18:20:23.294 INFO:journalctl@ceph.osd.2.smithi145.stdout:Jul 07 18:16:30 smithi145 conmon[71083]: ERROR 2021-07-07 18:16:30,536 [shard 0] none - /home/jenkins-build/build/workspace/ceph-dev-new-build/ARCH/x86_64/AVAILABLE_ARCH/x86_64/AVAILABLE_DIST/centos8/DIST/centos8/MACHINE_SIZE/gigantic/release/
17.0.0-5007-g3a9abb02/rpm/el8/BUILD/
ceph-17.0.0-5007-g3a9abb02/src/crimson/osd/osd_operations/peering_event.cc:165 : In function 'crimson::osd::RemotePeeringEvent::get_pg()::<lambda()>', ceph_assert(%s)
2021-07-07T18:20:23.294 INFO:journalctl@ceph.osd.2.smithi145.stdout:Jul 07 18:16:30 smithi145 conmon[71083]: osd.state.is_active()
2021-07-07T18:20:23.294 INFO:journalctl@ceph.osd.2.smithi145.stdout:Jul 07 18:16:30 smithi145 conmon[71083]: Aborting on shard 0.
```
Signed-off-by: Radoslaw Zarzynski <rzarzyns@redhat.com>
update_stats();
}},
asok{seastar::make_lw_shared<crimson::admin::AdminSocket>()},
- osdmap_gate("OSD::osdmap_gate", std::make_optional(std::ref(shard_services)))
+ osdmap_gate("OSD::osdmap_gate", std::make_optional(std::ref(shard_services))),
+ wait_for_active(std::in_place_t{})
{
osdmaps[0] = boost::make_local_shared<OSDMap>();
for (auto msgr : {std::ref(cluster_msgr), std::ref(public_msgr),
if (state.is_booting()) {
logger().info("osd.{}: activating...", whoami);
state.set_active();
+ assert(wait_for_active);
+ wait_for_active->set_value();
+ wait_for_active = std::nullopt;
beacon_timer.arm_periodic(
std::chrono::seconds(local_conf()->osd_beacon_report_interval));
tick_timer.arm_periodic(
void update_heartbeat_peers();
friend class PGAdvanceMap;
+ RemotePeeringEvent::OSDPipeline peering_request_osd_pipeline;
+ std::optional<seastar::shared_promise<>> wait_for_active;
+ friend class RemotePeeringEvent;
+
public:
blocking_future<Ref<PG>> get_or_create_pg(
spg_t pgid,
return get_osd_priv(conn.get()).peering_request_conn_pipeline;
}
+RemotePeeringEvent::OSDPipeline &RemotePeeringEvent::op()  // accessor: hands back the OSDPipeline stored on `osd`
+{
+ return osd.peering_request_osd_pipeline;
+}
+
void RemotePeeringEvent::on_pg_absent()
{
if (auto& e = get_event().get_event();
seastar::future<Ref<PG>> RemotePeeringEvent::get_pg()
{
return with_blocking_future(
- handle.enter(cp().await_map)
+ handle.enter(op().await_active)
).then([this] {
+ if (osd.wait_for_active) {
+ return osd.wait_for_active->get_shared_future();
+ } else {
+ return seastar::now();
+ }
+ }).then([this] {
+ return with_blocking_future(handle.enter(cp().await_map));
+ }).then([this] {
return with_blocking_future(
osd.osdmap_gate.wait_for_map(evt.get_epoch_sent()));
}).then([this](auto epoch) {
seastar::future<Ref<PG>> get_pg() final;
public:
+ class OSDPipeline {  // container for the phase below; RemotePeeringEvent::get_pg() enters it via op()
+ OrderedExclusivePhase await_active = {  // entered in get_pg() ahead of the wait on osd.wait_for_active
+ "PeeringRequest::OSDPipeline::await_active"
+ };
+ friend class RemotePeeringEvent;  // members are private by default, so this grants RemotePeeringEvent access
+ };
class ConnectionPipeline {
OrderedExclusivePhase await_map = {
"PeeringRequest::ConnectionPipeline::await_map"
private:
ConnectionPipeline &cp();
+ OSDPipeline &op();
};
class LocalPeeringEvent final : public PeeringEvent {