From: Kamoltat Date: Fri, 11 Nov 2022 22:56:46 +0000 (+0000) Subject: mon: clear connection score during update & add sanity check live/dead connection... X-Git-Tag: v16.2.11~28^2~2 X-Git-Url: http://git.apps.os.sepia.ceph.com/?a=commitdiff_plain;h=db95449e9c1702f75f6f064a841fac8f255ed5f8;p=ceph.git mon: clear connection score during update & add sanity check live/dead connection report When upgrading the monitors (include booting up), we check if `peer_tracker` is dirty or not. If so, we clear it. Added some functions in `Elector` and `ConnectionTracker` class to check for clean `peer_tracker`. Moreover, there could be some cases where due to startup weirdness or abnormal circumstances, we might get a report from our own rank. Therefore, it doesn't hurt to add a sanity check in `ConnectionTracker::report_live_connection` and `ConnectionTracker::report_dead_connection`. Fixes: https://tracker.ceph.com/issues/58049 Signed-off-by: Kamoltat (cherry picked from commit 25ce77c7984587f457eba9bd06e416ef06f4e1c7) --- diff --git a/src/mon/ConnectionTracker.cc b/src/mon/ConnectionTracker.cc index 022522aa97014..272ad40c27461 100644 --- a/src/mon/ConnectionTracker.cc +++ b/src/mon/ConnectionTracker.cc @@ -106,6 +106,10 @@ void ConnectionTracker::report_live_connection(int peer_rank, double units_alive { ldout(cct, 30) << __func__ << " peer_rank: " << peer_rank << " units_alive: " << units_alive << dendl; ldout(cct, 30) << "my_reports before: " << my_reports << dendl; + if (peer_rank == rank) { + lderr(cct) << "Got a report from my own rank, hopefully this is startup weirdness, dropping" << dendl; + return; + } // we need to "auto-initialize" to 1, do shenanigans auto i = my_reports.history.find(peer_rank); if (i == my_reports.history.end()) { @@ -130,6 +134,10 @@ void ConnectionTracker::report_dead_connection(int peer_rank, double units_dead) { ldout(cct, 30) << __func__ << " peer_rank: " << peer_rank << " units_dead: " << units_dead << dendl; ldout(cct, 30) << "my_reports before: " << my_reports << dendl; + if (peer_rank == rank) { + lderr(cct) << "Got a report from my own rank, hopefully this is startup weirdness, dropping" << dendl; + return; + } // we need to "auto-initialize" to 1, do shenanigans auto i = my_reports.history.find(peer_rank); if (i == my_reports.history.end()) { @@ -246,6 +254,22 @@ void ConnectionTracker::notify_rank_removed(int rank_removed, int new_rank) increase_version(); } +bool ConnectionTracker::is_clean(int mon_rank, int monmap_size) +{ + ldout(cct, 30) << __func__ << dendl; + // check consistency between our rank according + // to monmap and our rank according to our report. + if (rank != mon_rank || + my_reports.rank != mon_rank) { + return false; + } else if (!peer_reports.empty()){ + // if peer_report max rank is greater than monmap max rank + // then there is a problem. + if (peer_reports.rbegin()->first > monmap_size - 1) return false; + } + return true; +} + void ConnectionTracker::encode(bufferlist &bl) const { ENCODE_START(1, 1, bl); diff --git a/src/mon/ConnectionTracker.h b/src/mon/ConnectionTracker.h index a8b116efcbfb1..09506636d1e74 100644 --- a/src/mon/ConnectionTracker.h +++ b/src/mon/ConnectionTracker.h @@ -120,6 +120,13 @@ class ConnectionTracker { */ void get_total_connection_score(int peer_rank, double *rating, int *live_count) const; + /** + * Check if our ranks are clean and make + * sure there are no extra peer_report lingering. + * In the future we also want to check the reports + * current and history of each peer_report. + */ + bool is_clean(int mon_rank, int monmap_size); /** * Encode this ConnectionTracker. Useful both for storing on disk * and for sending off to peers for decoding and import @@ -185,6 +192,7 @@ class ConnectionTracker { rank = new_rank; my_reports.rank = rank; } + void notify_rank_changed(int new_rank); void notify_rank_removed(int rank_removed, int new_rank); friend std::ostream& operator<<(std::ostream& o, const ConnectionTracker& c); diff --git a/src/mon/Elector.cc b/src/mon/Elector.cc index c725a38e9ba6c..671c08d85929b 100644 --- a/src/mon/Elector.cc +++ b/src/mon/Elector.cc @@ -717,6 +717,11 @@ void Elector::start_participating() logic.participating = true; } +bool Elector::peer_tracker_is_clean() +{ + return peer_tracker.is_clean(mon->rank, paxos_size()); +} + void Elector::notify_clear_peer_state() { dout(10) << __func__ << dendl; diff --git a/src/mon/Elector.h b/src/mon/Elector.h index 7ea38400df558..2a53c1fc4350e 100644 --- a/src/mon/Elector.h +++ b/src/mon/Elector.h @@ -357,6 +357,11 @@ class Elector : public ElectionOwner, RankProvider { * @post @p participating is true */ void start_participating(); + /** + * Check if our peer_tracker is self-consistent, not suffering from + * https://tracker.ceph.com/issues/58049 + */ + bool peer_tracker_is_clean(); /** * Forget everything about our peers. :( */ diff --git a/src/mon/Monitor.cc b/src/mon/Monitor.cc index c8cbe903553df..365159b496e4d 100644 --- a/src/mon/Monitor.cc +++ b/src/mon/Monitor.cc @@ -950,7 +950,15 @@ int Monitor::init() mgrmon()->prime_mgr_client(); state = STATE_PROBING; + bootstrap(); + + if (!elector.peer_tracker_is_clean()){ + dout(10) << "peer_tracker looks inconsistent" + << " previous bad logic, clearing ..." << dendl; + elector.notify_clear_peer_state(); + } + // add features of myself into feature_map session_map.feature_map.add_mon(con_self->get_features()); return 0;