From ed7f4e8829eed3f72db520a6bb8817d5be51b741 Mon Sep 17 00:00:00 2001 From: Kamoltat Date: Wed, 17 Jul 2024 22:26:55 +0000 Subject: [PATCH] qa: Added mon connection score tests Basically when we deploy a 3 MONS Check if the connection scores are clean with a 60 seconds grace period Fixes: https://tracker.ceph.com/issues/65695 Signed-off-by: Kamoltat --- .../singleton/all/mon-connection-score.yaml | 40 ++++++++ qa/tasks/ceph_test_case.py | 11 +-- qa/tasks/mon_connection_score.py | 95 +++++++++++++++++++ 3 files changed, 139 insertions(+), 7 deletions(-) create mode 100644 qa/suites/rados/singleton/all/mon-connection-score.yaml create mode 100644 qa/tasks/mon_connection_score.py diff --git a/qa/suites/rados/singleton/all/mon-connection-score.yaml b/qa/suites/rados/singleton/all/mon-connection-score.yaml new file mode 100644 index 0000000000000..f9e0ba3452dd8 --- /dev/null +++ b/qa/suites/rados/singleton/all/mon-connection-score.yaml @@ -0,0 +1,40 @@ +roles: +- - mon.a + - mon.b + - mon.c + - osd.0 + - osd.1 + - osd.2 + - mgr.x + - client.0 + +openstack: + - volumes: # attached to each instance + count: 3 + size: 10 # GB +tasks: +- install: +- ceph: + pre-mgr-commands: + - sudo ceph config set mgr mgr_pool false --force + log-ignorelist: + - overall HEALTH_ + - \(OSDMAP_FLAGS\) + - \(OSD_ + - \(PG_ + - \(POOL_ + - \(CACHE_POOL_ + - \(OBJECT_ + - \(SLOW_OPS\) + - \(REQUEST_SLOW\) + - \(TOO_FEW_PGS\) + - slow request + - \(POOL_APP_NOT_ENABLED\) + - overall HEALTH_ + - \(MGR_DOWN\) + - \(MON_DOWN\) + - \(PG_AVAILABILITY\) + - \(SLOW_OPS\) +- cephfs_test_runner: + modules: + - tasks.mon_connection_score \ No newline at end of file diff --git a/qa/tasks/ceph_test_case.py b/qa/tasks/ceph_test_case.py index 8347b89062949..7afcbc2f2eb52 100644 --- a/qa/tasks/ceph_test_case.py +++ b/qa/tasks/ceph_test_case.py @@ -353,13 +353,10 @@ class CephTestCase(unittest.TestCase, RunCephCmd): while True: if condition(): success_time_elapsed = 0 - while success_time_elapsed < 
from tasks.ceph_test_case import CephTestCase
import json
import logging
log = logging.getLogger(__name__)


class TestStretchClusterNew(CephTestCase):
    """
    Deploy a 3-MON cluster and verify that the monitors' connection
    scores are clean and stay clean for a grace period.

    Fixes: https://tracker.ceph.com/issues/65695
    """

    CLUSTER = "ceph"
    # Expected rank for each monitor, keyed by mon id.
    MONS = {
        "a": {
            "rank": 0,
        },
        "b": {
            "rank": 1,
        },
        "c": {
            "rank": 2,
        }
    }
    WRITE_PERIOD = 10
    # Generous window for scores to settle after quorum forms.
    RECOVERY_PERIOD = WRITE_PERIOD * 6
    SUCCESS_HOLD_TIME = 10

    def setUp(self):
        """
        Set up the cluster for the test.
        """
        super(TestStretchClusterNew, self).setUp()

    def tearDown(self):
        """
        Clean up the cluster after the test.
        """
        super(TestStretchClusterNew, self).tearDown()

    def _check_connection_score(self):
        """
        Check the connection score of all the mons.

        Returns True only when, for every monitor:
          * the daemon reports the rank we expect for it,
          * it holds one report per monitor (all peers plus itself),
          * every peer in every report is alive, and
          * each report covers exactly the expected set of ranks.
        Logs the first discrepancy found and returns False otherwise.
        """
        # The expected rank set is the same for every mon/report, so
        # compute it once instead of inside the per-report loop.
        expected_ranks = sorted(
            rank
            for data in self.MONS.values()
            for rank in data.values()
        )
        for mon, mon_info in self.MONS.items():
            # get the connection score via the daemon's admin socket
            cscore = self.ceph_cluster.mon_manager.raw_cluster_cmd(
                'daemon', 'mon.{}'.format(mon),
                'connection', 'scores', 'dump')
            # parse the connection score
            cscore = json.loads(cscore)
            # check if the current mon rank is correct
            if cscore["rank"] != mon_info["rank"]:
                log.error(
                    "Rank mismatch {} != {}".format(
                        cscore["rank"], mon_info["rank"]
                    )
                )
                return False
            # check if current mon has all the peer reports and ourself
            if len(cscore['reports']) != len(self.MONS):
                log.error(
                    "Reports count mismatch {}".format(cscore['reports'])
                )
                return False

            for report in cscore["reports"]:
                report_rank = []
                for peer in report["peer_scores"]:
                    # check if the peer is alive
                    if not peer["peer_alive"]:
                        log.error("Peer {} is not alive".format(peer))
                        return False
                    report_rank.append(peer["peer_rank"])

                # Check that the report covers every expected rank with
                # no duplicates or extras.
                # BUG FIX: the original compared
                # `report_rank.sort() != expected_ranks.sort()`, but
                # list.sort() returns None, so that comparison was
                # always `None != None` -> False and the check could
                # never fail. Compare the sorted contents instead.
                if sorted(report_rank) != expected_ranks:
                    log.error("Rank mismatch in report {}".format(report))
                    return False

        log.info("Connection score is clean!")
        return True

    def test_connection_score(self):
        """
        Wait for a full 3-mon quorum, then require the connection
        scores to become clean and hold clean for the grace period.
        """
        # check if all mons are in quorum
        self.ceph_cluster.mon_manager.wait_for_mon_quorum_size(3)
        # check if all connection scores reflect this
        self.wait_until_true_and_hold(
            lambda: self._check_connection_score(),
            # Wait for 4 minutes for the connection score to recover
            timeout=self.RECOVERY_PERIOD * 4,
            # Hold the clean connection score for 60 seconds
            success_hold_time=self.SUCCESS_HOLD_TIME * 6
        )