From 23c3f76018b446fb77bbd71fdd33bddfbae9e06d Mon Sep 17 00:00:00 2001 From: =?utf8?q?Rados=C5=82aw=20Zarzy=C5=84ski?= Date: Tue, 17 May 2022 01:41:40 +0200 Subject: [PATCH] mon: fix a race between `mgr fail` and MgrMonitor::prepare_beacon() MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit There is a race condition between the `mgr fail` handling and `mgrbeacon`. ```diff diff --git a/src/mon/MgrMonitor.cc b/src/mon/MgrMonitor.cc index 8ada44e2628..9000b2e0687 100644 --- a/src/mon/MgrMonitor.cc +++ b/src/mon/MgrMonitor.cc @@ -1203,7 +1203,9 @@ bool MgrMonitor::prepare_command(MonOpRequestRef op) } if (changed && pending_map.active_gid == 0) { + dout(5) << "========== changed and active_state == 0" << dendl; promote_standby(); + dout(5) << "========== after promote_standby: " << pending_map.active_gid << dendl; } } else if (prefix == "mgr module enable") { string module; ``` ``` 2022-05-17T00:11:19.602+0200 7f6bd5769700 0 mon.a@0(leader) e1 handle_command mon_command({"prefix": "mgr fail", "who": "x"} v 0) v1 ... 2022-05-17T00:11:19.614+0200 7f6bd5769700 5 mon.a@0(leader).mgr e25 ========== changed and active_state == 0 2022-05-17T00:11:19.614+0200 7f6bd5769700 5 mon.a@0(leader).mgr e25 ========== after promote_standby: 0 2022-05-17T00:11:19.614+0200 7f6bd5769700 4 mon.a@0(leader).mgr e25 prepare_command done, r=0 ... 2022-05-17T00:11:19.630+0200 7f6bd5769700 4 mon.a@0(leader).mgr e25 selecting new active 4210 x (was 0 ) ``` ```cpp bool MgrMonitor::prepare_beacon(MonOpRequestRef op) if (pending_map.active_gid == m->get_gid()) { // ... } else if (pending_map.active_gid == 0) { // There is no currently active daemon, select this one. 
if (pending_map.standbys.count(m->get_gid())) { drop_standby(m->get_gid(), false); } dout(4) << "selecting new active " << m->get_gid() << " " << m->get_name() << " (was " << pending_map.active_gid << " " << pending_map.active_name << ")" << dendl; pending_map.active_gid = m->get_gid(); pending_map.active_name = m->get_name(); pending_map.active_change = ceph_clock_now() ``` The `25` version of `MgrMap`, when handled at `mgr.x`, doesn't trigger the `respawn()` path: ``` 2022-05-17T00:10:11.197+0200 7fa3d1e0a700 10 mgr ms_dispatch2 active mgrmap(e 25) v1 2022-05-17T00:10:11.197+0200 7fa3d1e0a700 4 mgr handle_mgr_map received map epoch 25 2022-05-17T00:10:11.197+0200 7fa3d1e0a700 4 mgr handle_mgr_map active in map: 1 active is 4210 2022-05-17T00:10:11.197+0200 7fa3d6613700 10 --2- 127.0.0.1:0/743576734 >> [v2:127.0.0.1:40929/0,v1:127.0.0.1:40930/0] conn(0x5592635ef400 0x5592635f6580 secure :-1 s=THROTTLE_DONE pgs=130 cs=0 l=1 rev1=1 crypto rx=0x55926362e810 tx=0x559263563b60 comp rx=0 tx=0).handle_read_frame_dispatch tag=17 2022-05-17T00:10:11.197+0200 7fa3d6613700 5 --2- 127.0.0.1:0/743576734 >> [v2:127.0.0.1:40929/0,v1:127.0.0.1:40930/0] conn(0x5592635ef400 0x5592635f6580 secure :-1 s=THROTTLE_DONE pgs=130 cs=0 l=1 rev1=1 crypto rx=0x55926362e810 tx=0x559263563b60 comp rx=0 tx=0).handle_message got 43089 + 0 + 0 byte message. 
envelope type=1796 src mon.0 off 0 2022-05-17T00:10:11.197+0200 7fa3d1e0a700 10 mgr handle_mgr_map I was already active ``` Fixes: https://tracker.ceph.com/issues/55711 Signed-off-by: Radosław Zarzyński --- src/mon/MgrMonitor.cc | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/src/mon/MgrMonitor.cc b/src/mon/MgrMonitor.cc index 86fa755bc5ae6..c394d3ee76921 100644 --- a/src/mon/MgrMonitor.cc +++ b/src/mon/MgrMonitor.cc @@ -1144,9 +1144,7 @@ bool MgrMonitor::prepare_command(MonOpRequestRef op) string format = cmd_getval_or<string>(cmdmap, "format", "plain"); boost::scoped_ptr<Formatter> f(Formatter::create(format)); - string prefix; - cmd_getval(cmdmap, "prefix", prefix); - + const auto prefix = cmd_getval_or<string>(cmdmap, "prefix", string{}); int r = 0; if (prefix == "mgr fail") { @@ -1274,6 +1272,9 @@ out: getline(ss, rs); if (r >= 0) { + if (prefix == "mgr fail" && is_writeable()) { + propose_pending(); + } // success.. delay reply wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, r, rs, get_last_committed() + 1)); -- 2.39.5