From 7572521a02f3cfe31b407d5bb0c599cb65f74908 Mon Sep 17 00:00:00 2001 From: Kamoltat Date: Wed, 2 Nov 2022 01:46:14 +0000 Subject: [PATCH] mon/ConnectionTracker.cc: Improve notify_rank_removed() PROBLEM: In `ConnectionTracker::receive_peer_report` we loop through ranks which is bad when there is `notify_rank_removed` before this and the ranks are not adjusted yet. When we rely on the rank in certain scenarios, we end up with extra peer_report copy which we don't want. SOLUTION: In `ConnectionTracker::receive_peer_report` instead of passing `report.rank` in the function `ConnectionTracker::reports`, we pass `i.first` instead so that trim old ranks properly. We also added a assert in notify_rank_removed(), comparing expected rank provided by the monmap against the rank that we adjust ourself to as a sanity check. We edited test/mon/test_election.cc to reflect the changes made in notify_rank_removed(). Fixes: https://tracker.ceph.com/issues/58049 Signed-off-by: Kamoltat (cherry picked from commit 7c52ccec76bc7e7f9678cc9d78d106e17f9ad8f7) Conflicts: src/mon/Elector.cc - trivial fix src/mon/Elector.h - trivial fix --- src/mon/ConnectionTracker.cc | 47 ++++++++++++++++++----------------- src/mon/ConnectionTracker.h | 2 +- src/mon/Elector.cc | 5 ++-- src/mon/Elector.h | 2 +- src/test/mon/test_election.cc | 12 ++++++--- 5 files changed, 37 insertions(+), 31 deletions(-) diff --git a/src/mon/ConnectionTracker.cc b/src/mon/ConnectionTracker.cc index 0df2f32995e7f..022522aa97014 100644 --- a/src/mon/ConnectionTracker.cc +++ b/src/mon/ConnectionTracker.cc @@ -62,8 +62,8 @@ void ConnectionTracker::receive_peer_report(const ConnectionTracker& o) ldout(cct, 30) << __func__ << dendl; for (auto& i : o.peer_reports) { const ConnectionReport& report = i.second; - if (report.rank == rank) continue; - ConnectionReport& existing = *reports(report.rank); + if (i.first == rank) continue; + ConnectionReport& existing = *reports(i.first); if (report.epoch > existing.epoch || (report.epoch == existing.epoch && report.epoch_version > existing.epoch_version)) { @@ -188,28 +188,23 @@ void ConnectionTracker::notify_rank_changed(int new_rank) rank = new_rank; encoding.clear(); ldout(cct, 20) << "peer_reports after: " << peer_reports << dendl; + + increase_version(); } -void ConnectionTracker::notify_rank_removed(int rank_removed) +void ConnectionTracker::notify_rank_removed(int rank_removed, int new_rank) { - - ldout(cct, 20) << __func__ << " " << rank_removed << dendl; - - // No point removing something that doesn't exist! - if (!peer_reports.count(rank_removed)) return; - + ldout(cct, 20) << __func__ << " " << rank_removed + << " new_rank: " << new_rank << dendl; ldout(cct, 20) << "my_reports before: " << my_reports << dendl; ldout(cct, 20) << "peer_reports before: " << peer_reports << dendl; ldout(cct, 20) << "my rank before: " << rank << dendl; encoding.clear(); - size_t starting_size = my_reports.current.size(); - // erase the removed rank from everywhere + size_t starting_size_current = my_reports.current.size(); + // Lets adjust everything in my report. my_reports.current.erase(rank_removed); my_reports.history.erase(rank_removed); - peer_reports.erase(rank_removed); - // Move ranks > rank_removed down by 1 - // First in my_reports' history+current auto ci = my_reports.current.upper_bound(rank_removed); auto hi = my_reports.history.upper_bound(rank_removed); while (ci != my_reports.current.end()) { @@ -219,22 +214,26 @@ void ConnectionTracker::notify_rank_removed(int rank_removed) my_reports.current.erase(ci++); my_reports.history.erase(hi++); } - ceph_assert((my_reports.current.size() == starting_size) || - (my_reports.current.size() + 1 == starting_size)); + ceph_assert((my_reports.current.size() == starting_size_current) || + (my_reports.current.size() + 1 == starting_size_current)); - // now move ranks down one in peer_reports - starting_size = peer_reports.size(); + size_t starting_size = peer_reports.size(); auto pi = peer_reports.upper_bound(rank_removed); + // Remove the target rank and adjust everything that comes after. + // Note that we don't adjust current and history for our peer_reports + // because it is better to rely on our peers on that information. + peer_reports.erase(rank_removed); while (pi != peer_reports.end()) { - peer_reports[pi->first - 1] = pi->second; - peer_reports.erase(pi++); + peer_reports[pi->first - 1] = pi->second; // copy content of next rank to ourself. + peer_reports.erase(pi++); // destroy our next rank and move on. } + ceph_assert((peer_reports.size() == starting_size) || - (peer_reports.size() + 1 == starting_size)); + (peer_reports.size() + 1 == starting_size)); - if (rank_removed < rank) { + if (rank_removed < rank) { // if the rank removed is lower than us, we need to adjust. --rank; - my_reports.rank = rank; + my_reports.rank = rank; // also adjust my_reports.rank. } ldout(cct, 20) << "my rank after: " << rank << dendl; @@ -243,6 +242,8 @@ void ConnectionTracker::notify_rank_removed(int rank_removed) //check if the new_rank from monmap is equal to our adjusted rank. ceph_assert(rank == new_rank); + + increase_version(); } void ConnectionTracker::encode(bufferlist &bl) const diff --git a/src/mon/ConnectionTracker.h b/src/mon/ConnectionTracker.h index 0cf4be3c80cfc..c248f7f692764 100644 --- a/src/mon/ConnectionTracker.h +++ b/src/mon/ConnectionTracker.h @@ -181,7 +181,7 @@ class ConnectionTracker { } void notify_reset() { clear_peer_reports(); } void notify_rank_changed(int new_rank); - void notify_rank_removed(int rank_removed); + void notify_rank_removed(int rank_removed, int new_rank); friend std::ostream& operator<<(std::ostream& o, const ConnectionTracker& c); friend ConnectionReport *get_connection_reports(ConnectionTracker& ct); friend map *get_peer_reports(ConnectionTracker& ct); diff --git a/src/mon/Elector.cc b/src/mon/Elector.cc index 9866080630cf1..af1109ae8d269 100644 --- a/src/mon/Elector.cc +++ b/src/mon/Elector.cc @@ -626,6 +626,7 @@ void Elector::dispatch(MonOpRequestRef op) } auto em = op->get_req(); + dout(20) << __func__ << " from: " << mon->monmap->get_rank(em->get_source_addr()) << dendl; // assume an old message encoding would have matched if (em->fsid != mon->monmap->fsid) { dout(0) << " ignoring election msg fsid " @@ -730,10 +731,10 @@ void Elector::notify_rank_changed(int new_rank) dead_pinging.erase(new_rank); } -void Elector::notify_rank_removed(int rank_removed) +void Elector::notify_rank_removed(int rank_removed, int new_rank) { dout(10) << __func__ << ": " << rank_removed << dendl; - peer_tracker.notify_rank_removed(rank_removed); + peer_tracker.notify_rank_removed(rank_removed, new_rank); /* we have to clean up the pinging state, which is annoying because it's not indexed anywhere (and adding indexing would also be annoying). diff --git a/src/mon/Elector.h b/src/mon/Elector.h index 241f7e0ebf8fb..7ea38400df558 100644 --- a/src/mon/Elector.h +++ b/src/mon/Elector.h @@ -371,7 +371,7 @@ class Elector : public ElectionOwner, RankProvider { * This is safe to call even if we haven't joined or are currently * in a quorum. */ - void notify_rank_removed(int rank_removed); + void notify_rank_removed(int rank_removed, int new_rank); void notify_strategy_maybe_changed(int strategy); /** * Set the disallowed leaders. diff --git a/src/test/mon/test_election.cc b/src/test/mon/test_election.cc index 5b3af11847890..a232d7355f9f5 100644 --- a/src/test/mon/test_election.cc +++ b/src/test/mon/test_election.cc @@ -124,9 +124,13 @@ struct Owner : public ElectionOwner, RankProvider { // don't need to do anything with our state right now void notify_bump_epoch() {} void notify_rank_removed(int removed_rank) { - peer_tracker.notify_rank_removed(removed_rank); - if (rank > removed_rank) + ldout(g_ceph_context, 1) << "removed_rank: " << removed_rank << dendl; + ldout(g_ceph_context, 1) << "rank before: " << rank << dendl; + if (removed_rank < rank) { --rank; + } + peer_tracker.notify_rank_removed(removed_rank, rank); + ldout(g_ceph_context, 1) << "rank after: " << rank << dendl; } void notify_deleted() { rank_deleted = true; rank = -1; cancel_timer(); } // pass back to ElectionLogic; we don't need this redirect ourselves @@ -216,7 +220,7 @@ struct Owner : public ElectionOwner, RankProvider { } } void receive_scores(bufferlist bl) { - ConnectionTracker oct(bl); + ConnectionTracker oct(bl, g_ceph_context); peer_tracker.receive_peer_report(oct); ldout(g_ceph_context, 10) << "received scores " << oct << dendl; } @@ -362,7 +366,7 @@ void Election::propose_to(int from, int to, epoch_t e, bufferlist& cbl) Owner *o = electors[to]; ConnectionTracker *oct = NULL; if (cbl.length()) { - oct = new ConnectionTracker(cbl); // we leak these on blocked cons, meh + oct = new ConnectionTracker(cbl, g_ceph_context); // we leak these on blocked cons, meh } queue_election_message(from, to, [o, from, e, oct] { o->receive_propose(from, e, oct); -- 2.39.5