From fd9c40465bd6d99c95d724c730e1e170b0a7ebdd Mon Sep 17 00:00:00 2001 From: Max Kellermann Date: Mon, 14 Oct 2024 19:52:23 +0200 Subject: [PATCH] mon/MDSMonitor: send reply to beacons with `state=DNE` During shutdown, the MDS sends a `MSG_MDS_BEACON` with `MDSMap::STATE_DNE` (in `MDSDaemon::suicide()`) and then waits for a `MSG_MDS_BEACON` reply from the MON. The MON, however, suppresses replies to `STATE_DNE`; in `MDSMonitor::preprocess_beacon()`, it returns early on `STATE_DNE` and `MDSMonitor::prepare_beacon()` silently evicts the dying MDS without any reply. This delays the MDS shutdown until the MDS times out. Since `MDSDaemon::suicide()` has code to wait for a beacon reply, I figure that the MON reply was suppressed accidentally; therefore I suggest adding it. Fixes: https://tracker.ceph.com/issues/68761 Signed-off-by: Max Kellermann --- src/mon/MDSMonitor.cc | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/src/mon/MDSMonitor.cc b/src/mon/MDSMonitor.cc index d8cca4ceb61..f742303c6e9 100644 --- a/src/mon/MDSMonitor.cc +++ b/src/mon/MDSMonitor.cc @@ -758,6 +758,14 @@ bool MDSMonitor::prepare_beacon(MonOpRequestRef op) if (state == MDSMap::STATE_DNE) { dout(1) << __func__ << ": DNE from " << info << dendl; + + /* send a beacon reply so MDSDaemon::suicide() finishes the + Beacon::send_and_wait() call */ + auto beacon = make_message(mon.monmap->fsid, + m->get_global_id(), m->get_name(), get_fsmap().get_epoch(), + m->get_state(), m->get_seq(), CEPH_FEATURES_SUPPORTED_DEFAULT); + mon.send_reply(op, beacon.detach()); + goto evict; } -- 2.39.5