From f8aac3c743c8d006507ab2bac715700e9d07060a Mon Sep 17 00:00:00 2001 From: Kamoltat Sirivadhna Date: Thu, 20 Mar 2025 14:44:28 +0000 Subject: [PATCH] mon: Track and process pending pings after election Problem: Monitors stop pinging each other when quorum_mon_feature flag is empty. This happens when the monitor freshly starts up and never formed a quorum before, or when you restart the monitors in the cluster. Basically, Monitor startups. This problem can easily be reproduced everytime. Steps to reproduce: 1. Start 3 MONs with `connectivity` election strategy 2. Fail 1 mon. 3. Restart all the monitors (including the down monitor) 4. Observe that the connection scores of each monitor will tell you that not all monitors are alive. Which is not true because all 3 Monitors are in quorum. What happened was during monitor startups, quorum_mon_feature is empty and although they all participated in the election, when they hit the function begin_peer_ping, some monitors if not all will not send ping because of the emptry quorum_mon_feature flag. Therefore, after the election the monitors will have the wrong connection score. However, this will get resolved in the next election because now that quorum_mon_feature is populated they will start pinging each other again, hence, correct connectivity score. Solution: In begin_peer_ping, instead of just returning out of the function when quorum_mon_feature is empty, we keep track of the peers that we should ping once the election is finished. In Monitor::win_election and Monitor::lose_elections, we process the pending pings by calling begin_peer_ping on each of the peers (both peons and leader) Additionally: Improved loggings in Elector class such that debugging the pinging process gets easier. Fixes: https://tracker.ceph.com/issues/70587 Signed-off-by: Kamoltat Sirivadhna sd Signed-off-by: Kamoltat Sirivadhna (cherry picked from commit eadcb8ced35d9aa011828e8a0b69d7b2e168f934) --- src/mon/Elector.cc | 77 +++++++++++++++++++++++++++++++++++++--------- src/mon/Elector.h | 9 ++++++ src/mon/Monitor.cc | 4 +-- 3 files changed, 74 insertions(+), 16 deletions(-) diff --git a/src/mon/Elector.cc b/src/mon/Elector.cc index af0a701dcf511..d224840246190 100644 --- a/src/mon/Elector.cc +++ b/src/mon/Elector.cc @@ -453,22 +453,37 @@ void Elector::handle_nak(MonOpRequestRef op) void Elector::begin_peer_ping(int peer) { - dout(20) << __func__ << " against " << peer << dendl; + dout(20) << __func__ << " with " << peer << dendl; if (live_pinging.count(peer)) { dout(20) << peer << " already in live_pinging ... return " << dendl; return; } - - if (!mon->get_quorum_mon_features().contains_all( + // Check if quorum feature is not set and we are in + // STATE_INIT, STATE_PROBING, STATE_SYNCHRONIZING or STATE_ELECTING + if (mon->get_quorum_mon_features().empty() && + (!mon->is_leader() && !mon->is_peon() && !mon->is_shutdown())) { + dout(10) << "quorum mon feature is not yet set, " + << " we might need to wait until we form a quorum" + << dendl; + pending_pings.insert(peer); + return; + } else if (!mon->get_quorum_mon_features().contains_all( ceph::features::mon::FEATURE_PINGING)) { + dout(10) << "mon quorum does not support pinging .. return" << dendl; return; } + pending_pings.erase(peer); peer_tracker.report_live_connection(peer, 0); // init this peer as existing live_pinging.insert(peer); dead_pinging.erase(peer); peer_acked_ping[peer] = ceph_clock_now(); - if (!send_peer_ping(peer)) return; + if (!send_peer_ping(peer)) { + dout(20) << "send_peer_ping failed ..." + << " no need to schedule ping_check" << dendl; + return; + } + dout(30) << "schedule ping_check against peer: " << peer << dendl; mon->timer.add_event_after(ping_timeout / PING_DIVISOR, new C_MonContext{mon, [this, peer](int) { ping_check(peer); @@ -497,22 +512,37 @@ bool Elector::send_peer_ping(int peer, const utime_t *n) MMonPing *ping = new MMonPing(MMonPing::PING, now, peer_tracker.get_encoded_bl()); mon->messenger->send_to_mon(ping, mon->monmap->get_addrs(peer)); peer_sent_ping[peer] = now; + dout(20) << " sent ping successfully to peer: " << peer << dendl; return true; } +void Elector::process_pending_pings() +{ + dout(10) << __func__ << " processing " + << pending_pings.size() << " pending pings" << dendl; + + // Make a copy since begin_peer_ping will modify the set + std::set peers_to_ping = pending_pings; + for (int peer : peers_to_ping) { + begin_peer_ping(peer); + } +} + void Elector::ping_check(int peer) { - dout(20) << __func__ << " to peer " << peer << dendl; + dout(20) << __func__ << "ing peer " << peer << dendl; if (!live_pinging.count(peer) && !dead_pinging.count(peer)) { - dout(20) << __func__ << peer << " is no longer marked for pinging" << dendl; + dout(20) << peer << " is no longer marked for pinging ... return" << dendl; return; } utime_t now = ceph_clock_now(); utime_t& acked_ping = peer_acked_ping[peer]; utime_t& newest_ping = peer_sent_ping[peer]; if (!acked_ping.is_zero() && acked_ping < now - ping_timeout) { + dout(20) << "peer " << peer << " has not acked a ping in " + << now - acked_ping << " seconds" << dendl; peer_tracker.report_dead_connection(peer, now - acked_ping); acked_ping = now; begin_dead_ping(peer); @@ -520,9 +550,17 @@ void Elector::ping_check(int peer) } if (acked_ping == newest_ping) { - if (!send_peer_ping(peer, &now)) return; + dout(20) << "peer " << peer + << " has not acked the newest ping" + << " .. sending another ping" << dendl; + if (!send_peer_ping(peer, &now)) { + dout(20) << "send_peer_ping failed ..." + << " no need to schedule " << __func__ << dendl; + return; + } } + dout(30) << "schedule " << __func__ << " against peer: "<< peer << dendl; mon->timer.add_event_after(ping_timeout / PING_DIVISOR, new C_MonContext{mon, [this, peer](int) { ping_check(peer); @@ -533,11 +571,13 @@ void Elector::begin_dead_ping(int peer) { dout(20) << __func__ << " to peer " << peer << dendl; if (dead_pinging.count(peer)) { + dout(20) << peer << " already in dead_pinging ... return" << dendl; return; } live_pinging.erase(peer); dead_pinging.insert(peer); + dout(30) << "schedule dead_ping against peer: " << peer << dendl; mon->timer.add_event_after(ping_timeout, new C_MonContext{mon, [this, peer](int) { dead_ping(peer); @@ -558,6 +598,7 @@ void Elector::dead_ping(int peer) peer_tracker.report_dead_connection(peer, now - acked_ping); acked_ping = now; + dout(30) << "schedule " << __func__ << " against peer: " << peer << dendl; mon->timer.add_event_after(ping_timeout, new C_MonContext{mon, [this, peer](int) { dead_ping(peer); @@ -574,13 +615,15 @@ void Elector::handle_ping(MonOpRequestRef op) switch(m->op) { case MMonPing::PING: { + dout(30) << "recieved PING from " + << prank << ", sending PING_REPLY back!" << dendl; MMonPing *reply = new MMonPing(MMonPing::PING_REPLY, m->stamp, peer_tracker.get_encoded_bl()); m->get_connection()->send_message(reply); } break; case MMonPing::PING_REPLY: - + dout(30) << "recieved PING_REPLY from " << prank << dendl; const utime_t& previous_acked = peer_acked_ping[prank]; const utime_t& newest = peer_sent_ping[prank]; @@ -591,17 +634,23 @@ void Elector::handle_ping(MonOpRequestRef op) } if (m->stamp > previous_acked) { - dout(20) << "m->stamp > previous_acked" << dendl; + dout(30) << "recieved good PING_REPLY!" << dendl; peer_tracker.report_live_connection(prank, m->stamp - previous_acked); peer_acked_ping[prank] = m->stamp; - } else{ - dout(20) << "m->stamp <= previous_acked .. we don't report_live_connection" << dendl; + } else { + dout(30) << "recieved bad PING_REPLY! it's the same or older " + << "than the most recent ack we got." << dendl; } utime_t now = ceph_clock_now(); - dout(30) << "now: " << now << " m->stamp: " << m->stamp << " ping_timeout: " - << ping_timeout << " PING_DIVISOR: " << PING_DIVISOR << dendl; if (now - m->stamp > ping_timeout / PING_DIVISOR) { - if (!send_peer_ping(prank, &now)) return; + dout(30) << "peer " << prank << " has not acked a ping in " + << now - m->stamp << " seconds, which is more than the " + << ping_timeout / PING_DIVISOR << " seconds limit." + << " Sending another ping ..." << dendl; + if (!send_peer_ping(prank, &now)) { + dout(10) << "send_peer_ping failed ..." << dendl; + return; + } } break; } diff --git a/src/mon/Elector.h b/src/mon/Elector.h index faeb2c58c0e2e..e4a11b90190dc 100644 --- a/src/mon/Elector.h +++ b/src/mon/Elector.h @@ -39,6 +39,10 @@ class Elector : public ElectionOwner, RankProvider { * @defgroup Elector_h_class Elector * @{ */ + + private: + std::set pending_pings; // Monitors waiting for quorum features to be established + ElectionLogic logic; // connectivity validation and scoring ConnectionTracker peer_tracker; @@ -406,6 +410,11 @@ class Elector : public ElectionOwner, RankProvider { disallowed_leaders = dl; return true; } + /** + * process all pending pings when quorum is established + * + */ + void process_pending_pings(); void dump_connection_scores(Formatter *f) { f->open_object_section("connection scores"); peer_tracker.dump(f); diff --git a/src/mon/Monitor.cc b/src/mon/Monitor.cc index 8804c50a3dabd..c138b2a2b51d2 100644 --- a/src/mon/Monitor.cc +++ b/src/mon/Monitor.cc @@ -2290,7 +2290,7 @@ void Monitor::win_election(epoch_t epoch, const set& active, uint64_t featu encode(m, bl); t->put(MONITOR_STORE_PREFIX, "last_metadata", bl); } - + elector.process_pending_pings(); finish_election(); if (monmap->size() > 1 && monmap->get_epoch() > 0) { @@ -2343,7 +2343,7 @@ void Monitor::lose_election(epoch_t epoch, set &q, int l, _finish_svc_election(); logger->inc(l_mon_election_lose); - + elector.process_pending_pings(); finish_election(); } -- 2.39.5