From ed7f4e8829eed3f72db520a6bb8817d5be51b741 Mon Sep 17 00:00:00 2001 From: Kamoltat Date: Wed, 17 Jul 2024 22:26:55 +0000 Subject: [PATCH] qa: Added mon connection score tests Basically when we deploy a 3 MONS Check if the connection scores are clean with a 60 seconds grace period Fixes: https://tracker.ceph.com/issues/65695 Signed-off-by: Kamoltat --- .../singleton/all/mon-connection-score.yaml | 40 ++++++++ qa/tasks/ceph_test_case.py | 11 +-- qa/tasks/mon_connection_score.py | 95 +++++++++++++++++++ 3 files changed, 139 insertions(+), 7 deletions(-) create mode 100644 qa/suites/rados/singleton/all/mon-connection-score.yaml create mode 100644 qa/tasks/mon_connection_score.py diff --git a/qa/suites/rados/singleton/all/mon-connection-score.yaml b/qa/suites/rados/singleton/all/mon-connection-score.yaml new file mode 100644 index 0000000000000..f9e0ba3452dd8 --- /dev/null +++ b/qa/suites/rados/singleton/all/mon-connection-score.yaml @@ -0,0 +1,40 @@ +roles: +- - mon.a + - mon.b + - mon.c + - osd.0 + - osd.1 + - osd.2 + - mgr.x + - client.0 + +openstack: + - volumes: # attached to each instance + count: 3 + size: 10 # GB +tasks: +- install: +- ceph: + pre-mgr-commands: + - sudo ceph config set mgr mgr_pool false --force + log-ignorelist: + - overall HEALTH_ + - \(OSDMAP_FLAGS\) + - \(OSD_ + - \(PG_ + - \(POOL_ + - \(CACHE_POOL_ + - \(OBJECT_ + - \(SLOW_OPS\) + - \(REQUEST_SLOW\) + - \(TOO_FEW_PGS\) + - slow request + - \(POOL_APP_NOT_ENABLED\) + - overall HEALTH_ + - \(MGR_DOWN\) + - \(MON_DOWN\) + - \(PG_AVAILABILITY\) + - \(SLOW_OPS\) +- cephfs_test_runner: + modules: + - tasks.mon_connection_score \ No newline at end of file diff --git a/qa/tasks/ceph_test_case.py b/qa/tasks/ceph_test_case.py index 8347b89062949..7afcbc2f2eb52 100644 --- a/qa/tasks/ceph_test_case.py +++ b/qa/tasks/ceph_test_case.py @@ -353,13 +353,10 @@ class CephTestCase(unittest.TestCase, RunCephCmd): while True: if condition(): success_time_elapsed = 0 - while success_time_elapsed < 
from tasks.ceph_test_case import CephTestCase
import json
import logging
log = logging.getLogger(__name__)


class TestStretchClusterNew(CephTestCase):
    """
    Deploy a 3-MON cluster and verify that the monitors' connection
    scores are clean and stay clean for a grace period.

    Fixes: https://tracker.ceph.com/issues/65695
    """

    CLUSTER = "ceph"
    # Expected rank for each monitor, keyed by mon id.
    MONS = {
        "a": {
            "rank": 0,
        },
        "b": {
            "rank": 1,
        },
        "c": {
            "rank": 2,
        }
    }
    WRITE_PERIOD = 10
    # Generous window for scores to settle after quorum forms.
    RECOVERY_PERIOD = WRITE_PERIOD * 6
    SUCCESS_HOLD_TIME = 10

    def setUp(self):
        """
        Set up the cluster for the test.
        """
        super(TestStretchClusterNew, self).setUp()

    def tearDown(self):
        """
        Clean up the cluster after the test.
        """
        super(TestStretchClusterNew, self).tearDown()

    def _check_connection_score(self):
        """
        Check the connection score of all the mons.

        Returns True only when, for every monitor:
          * the daemon reports the rank we expect for it,
          * it holds one report per monitor (all peers plus itself),
          * every peer in every report is alive, and
          * each report covers exactly the expected set of ranks.
        Logs the first discrepancy found and returns False otherwise.
        """
        # The expected rank set is the same for every mon/report, so
        # compute it once instead of inside the per-report loop.
        expected_ranks = sorted(
            rank
            for data in self.MONS.values()
            for rank in data.values()
        )
        for mon, mon_info in self.MONS.items():
            # get the connection score via the daemon's admin socket
            cscore = self.ceph_cluster.mon_manager.raw_cluster_cmd(
                'daemon', 'mon.{}'.format(mon),
                'connection', 'scores', 'dump')
            # parse the connection score
            cscore = json.loads(cscore)
            # check if the current mon rank is correct
            if cscore["rank"] != mon_info["rank"]:
                log.error(
                    "Rank mismatch {} != {}".format(
                        cscore["rank"], mon_info["rank"]
                    )
                )
                return False
            # check if current mon has all the peer reports and ourself
            if len(cscore['reports']) != len(self.MONS):
                log.error(
                    "Reports count mismatch {}".format(cscore['reports'])
                )
                return False

            for report in cscore["reports"]:
                report_rank = []
                for peer in report["peer_scores"]:
                    # check if the peer is alive
                    if not peer["peer_alive"]:
                        log.error("Peer {} is not alive".format(peer))
                        return False
                    report_rank.append(peer["peer_rank"])

                # Check that the report covers every expected rank with
                # no duplicates or extras.
                # BUG FIX: the original compared
                # `report_rank.sort() != expected_ranks.sort()`, but
                # list.sort() returns None, so that comparison was
                # always `None != None` -> False and the check could
                # never fail. Compare the sorted contents instead.
                if sorted(report_rank) != expected_ranks:
                    log.error("Rank mismatch in report {}".format(report))
                    return False

        log.info("Connection score is clean!")
        return True

    def test_connection_score(self):
        """
        Wait for a full 3-mon quorum, then require the connection
        scores to become clean and hold clean for the grace period.
        """
        # check if all mons are in quorum
        self.ceph_cluster.mon_manager.wait_for_mon_quorum_size(3)
        # check if all connection scores reflect this
        self.wait_until_true_and_hold(
            lambda: self._check_connection_score(),
            # Wait for 4 minutes for the connection score to recover
            timeout=self.RECOVERY_PERIOD * 4,
            # Hold the clean connection score for 60 seconds
            success_hold_time=self.SUCCESS_HOLD_TIME * 6
        )