qa/suites/rados: whitelist + add MON_NETSPLIT tests
author    Kamoltat Sirivadhna <ksirivad@redhat.com>
          Fri, 25 Jul 2025 04:18:46 +0000 (04:18 +0000)
committer Kamoltat Sirivadhna <ksirivad@redhat.com>
          Fri, 10 Oct 2025 20:36:50 +0000 (20:36 +0000)
Some existing netsplit tests in 3az + stretch mode
needed whitelisting and checks for netsplit details.

Make qa/tasks/mon_thrash.py set
mon_netsplit_grace_period to 30 seconds
when we try to freeze monitors instead of killing them.

Make qa/tasks/stretch_mode_disable_enable.py set
mon_netsplit_grace_period to 30 seconds
during the `teardown` phase only.
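
Both of these amount to the same monitor command; a minimal sketch of the
equivalent CLI call (the qa tasks issue it via manager.raw_cluster_cmd()),
assuming a ceph client keyring is available on the node:

    import subprocess

    # Bump the netsplit grace period so slow reconnects after a
    # freeze/restart do not raise a spurious MON_NETSPLIT warning.
    subprocess.check_call(
        ['ceph', 'config', 'set', 'mon', 'mon_netsplit_grace_period', '30'])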

Fixes: https://tracker.ceph.com/issues/71344
Signed-off-by: Kamoltat Sirivadhna <ksirivad@redhat.com>
qa/suites/netsplit/ceph.yaml
qa/suites/rados/singleton/all/3-az-stretch-cluster-netsplit.yaml
qa/suites/rados/singleton/all/stretch-mode-5-mons-8-osds.yaml
qa/suites/rados/singleton/msgr-failures/few.yaml
qa/suites/rados/singleton/msgr-failures/many.yaml
qa/tasks/mon_thrash.py
qa/tasks/stretch_mode_disable_enable.py
qa/tasks/test_netsplit.py
qa/tasks/test_netsplit_3az_stretch_pool.py

index 9e90a87ee5a2806f6d832cdc97271942025ac839..5820fcefde7d9718a6c21fb9819308566a600b02 100644 (file)
@@ -31,6 +31,7 @@ overrides:
       - \(PG_AVAILABILITY\)
       - \(SLOW_OPS\)
       - \[WRN\]
+      - \(MON_NETSPLIT\)
 tasks:
 - install:
 - ceph:
index e78c95577dadeac9acaea946987ab41ddcbe5d35..3979533eed0bdbbd72703ae138357bde660258c4 100644 (file)
@@ -64,6 +64,7 @@ tasks:
       - \(PG_AVAILABILITY\)
       - \(SLOW_OPS\)
       - \[WRN\]
+      - \(MON_NETSPLIT\)
 - workunit:
     clients:
       client.0:
index 69a54b0f1b772a89738ac3be98600a1feb0dca9d..146080e5133fd3ea4b281704c1251886300e9109 100644 (file)
@@ -46,8 +46,6 @@ tasks:
       - \(OSD_DATACENTER_DOWN\)
       - \(OSD_DOWN\)
       - \(OSD_HOST_DOWN\)
-
-
 - workunit:
     clients:
       client.0:
index 8fd638744c817b758d48e7bbe147f279b3ea1b9d..f7cc4817114e283bc4c0e36145e78fea8e2630cf 100644 (file)
@@ -7,3 +7,4 @@ overrides:
     log-ignorelist:
       - \(OSD_SLOW_PING_TIME
       - \(MON_DOWN\)
+      - \(MON_NETSPLIT\)
index 206da3ec15acf710556937543cb6cfbc5b4395b6..2c66754851a850c4cee863db7c9c99839af34709 100644 (file)
@@ -11,3 +11,4 @@ overrides:
     log-ignorelist:
       - \(OSD_SLOW_PING_TIME
       - \(MON_DOWN\)
+      - \(MON_NETSPLIT\)
index 97fa38983a6f9b29d126c332257a87babc290957..3874076ae497e7c10653a130a520a54417c75a0b 100644 (file)
@@ -64,7 +64,7 @@ class MonitorThrasher(Thrasher):
                         task to run with as many as just one single monitor.
                         (default: True)
     freeze_mon_probability: how often to freeze the mon instead of killing it,
-                        in % (default: 0)
+                        in % (default: 10)
     freeze_mon_duration: how many seconds to freeze the mon (default: 15)
     scrub               Scrub after each iteration (default: True)
     check_mds_failover  Check if mds failover happened (default: False)
@@ -128,6 +128,15 @@ class MonitorThrasher(Thrasher):
         self.scrub = self.config.get('scrub', True)
 
         self.freeze_mon_probability = float(self.config.get('freeze_mon_probability', 10))
+        # When many monitors are frozen at once and revived after a long
+        # time, their connections can take longer to re-establish.
+        # Therefore, we increase the netsplit grace period to 30 seconds.
+        # This avoids false positives from MON_NETSPLIT while still
+        # keeping the integrity of the test.
+        if self.freeze_mon_probability > 0:
+            self.manager.raw_cluster_cmd(
+                'config', 'set', 'mon', 'mon_netsplit_grace_period', '30')
+
         self.freeze_mon_duration = float(self.config.get('freeze_mon_duration', 15.0))
 
         assert self.max_killable() > 0, \
index a84a85bb307c1e7823bf1bd4e72bf60713b4afb2..69a0da6c7741d9858c1212538838dfdea132f7cd 100644 (file)
@@ -122,13 +122,21 @@ class TestStretchMode(MgrTestCase):
         """
         Bring back the mon.
         """
+        log.debug("_bring_back_mon %s", mon)
+        # If the mon is already up, do nothing
+        quorum_names = self.mgr_cluster.mon_manager.get_mon_quorum_names()
+        if mon in quorum_names:
+            log.debug("mon.%s is already up", mon)
+            return
+        # If the mon is not up, try to bring it back
+        log.debug("Bringing back mon.%s", mon)
         try:
             self.ctx.daemons.get_daemon('mon', mon, self.CLUSTER).restart()
         except Exception:
             log.error("Failed to bring back mon.{}".format(str(mon)))
             pass
 
-    def _get_host(self, osd):
+    def _get_osd_host(self, osd):
         """
         Get the host of the osd.
         """
@@ -142,7 +150,7 @@ class TestStretchMode(MgrTestCase):
         """
         Move the osd back to the host.
         """
-        host = self._get_host(osd)
+        host = self._get_osd_host(osd)
         assert host is not None, "The host of osd {} is not found.".format(osd)
         log.debug("Moving osd.%d back to %s", osd, host)
         self.mgr_cluster.mon_manager.raw_cluster_cmd(
@@ -155,6 +163,7 @@ class TestStretchMode(MgrTestCase):
         Clean up the cluster after the test.
         """
         # Remove the pool
+        log.debug("Tear down the test")
         if self.POOL in self.mgr_cluster.mon_manager.pools:
             self.mgr_cluster.mon_manager.remove_pool(self.POOL)
 
@@ -168,7 +177,14 @@ class TestStretchMode(MgrTestCase):
             if osd['up'] == 0:
                 self.mgr_cluster.mon_manager.revive_osd(osd['osd'])
                 self._move_osd_back_to_host(osd['osd'])
-        
+
+        # Set mon_netsplit_grace_period to 30 seconds:
+        # when many mons restart at the same time, it can
+        # take longer for the monitors to re-establish
+        # their connections.
+        self.mgr_cluster.mon_manager.raw_cluster_cmd(
+            'config', 'set', 'mon', 'mon_netsplit_grace_period', '30'
+        )
         # Bring back all the mons
         mons = self._get_all_mons_from_all_dc()
         for mon in mons:
@@ -359,6 +375,7 @@ class TestStretchMode(MgrTestCase):
             self.TIEBREAKER_MON_NAME,
             monmap['tiebreaker_mon']
         )
+        log.debug("Stretch mode is enabled correctly.")
 
     def _stretch_mode_disabled_correctly(self):
         """
@@ -445,6 +462,7 @@ class TestStretchMode(MgrTestCase):
             "",
             monmap['tiebreaker_mon']
         )
+        log.debug("Stretch mode is disabled correctly.")
 
     def test_disable_stretch_mode(self):
         """
index a16adc7eaac1969b8d5ded5247288a70049e855e..6ee6b30a87e3de002ae38fabc8814f7b4f721408 100755 (executable)
@@ -193,6 +193,50 @@ class TestNetSplit(CephTestCase):
         except Exception:
             return False
 
+    def _check_mon_netsplit_warning(self):
+        """
+        Returns True if MON_NETSPLIT warning exists in health checks.
+        """
+        (client,) = self.ctx.cluster.only(self.CLIENT).remotes.keys()
+        arg = ['ceph', 'health', 'detail', '--format=json']
+        proc = client.run(args=arg, wait=True, stdout=StringIO(), timeout=30)
+        if proc.exitstatus != 0:
+            log.error("ceph health detail failed")
+            raise Exception("ceph health detail failed")
+        out = proc.stdout.getvalue()
+        j = json.loads(out)
+        checks = j.get("checks", {})
+        return "MON_NETSPLIT" in checks
+
+    def _check_mon_netsplit_warning_raised(self, detail):
+        """
+        Check if the MON_NETSPLIT warning with the given detail is raised.
+        """
+        log.info("Checking if MON_NETSPLIT warning is raised with detail: {}".format(detail))
+        (client,) = self.ctx.cluster.only(self.CLIENT).remotes.keys()
+        arg = ['ceph', 'health', 'detail', '--format=json']
+        proc = client.run(args=arg, wait=True, stdout=StringIO(), timeout=30)
+        if proc.exitstatus != 0:
+            log.error("ceph health detail failed")
+            raise Exception("ceph health detail failed")
+        out = proc.stdout.getvalue()
+        j = json.loads(out)
+        # Access health checks
+        checks = j.get("checks", {})
+        netsplit = checks.get("MON_NETSPLIT", {})
+        if not netsplit:
+            log.info("MON_NETSPLIT not found in health checks")
+            return False
+
+        # Check if the expected detail is present
+        for d in netsplit.get("detail", []):
+            if detail in d.get("message", ""):
+                log.info("Found MON_NETSPLIT warning with detail: {}".format(d))
+                return True
+
+        log.info("MON_NETSPLIT found but detail does not match")
+        return False
+
     def test_netsplit_dc1_dc2(self):
         """
         Test Netsplit between dc1 and dc2
@@ -220,6 +264,13 @@ class TestNetSplit(CephTestCase):
             lambda: self._check_if_disconnect(config),
             timeout=self.RECOVERY_PERIOD,
         )
+        # check that no MON_NETSPLIT warning is raised; we expect none
+        # because this is stretch mode
+        self.wait_until_true_and_hold(
+            lambda: not self._check_mon_netsplit_warning(),
+            timeout=self.RECOVERY_PERIOD,
+            success_hold_time=self.SUCCESS_HOLD_TIME
+        )
         # check the cluster is accessible
         self.wait_until_true_and_hold(
             lambda: self._reply_to_mon_command(),
@@ -263,6 +314,12 @@ class TestNetSplit(CephTestCase):
             lambda: self._check_if_connect(config),
             timeout=self.RECOVERY_PERIOD,
         )
+        # check if no MON_NETSPLIT warning is raised
+        self.wait_until_true_and_hold(
+            lambda: not self._check_mon_netsplit_warning(),
+            timeout=self.RECOVERY_PERIOD,
+            success_hold_time=self.SUCCESS_HOLD_TIME
+        )
         # check if all the PGs are active+clean
         self.wait_until_true_and_hold(
             lambda: self._pg_all_active_clean(),
@@ -305,6 +362,13 @@ class TestNetSplit(CephTestCase):
             lambda: self._check_if_disconnect(dc1_dc2),
             timeout=self.RECOVERY_PERIOD,
         )
+        # check that no MON_NETSPLIT warning is raised; we expect none
+        # because this is stretch mode
+        self.wait_until_true_and_hold(
+            lambda: not self._check_mon_netsplit_warning(),
+            timeout=self.RECOVERY_PERIOD,
+            success_hold_time=self.SUCCESS_HOLD_TIME
+        )
         # check the cluster is accessible
         self.wait_until_true_and_hold(
             lambda: self._reply_to_mon_command(),
@@ -352,6 +416,12 @@ class TestNetSplit(CephTestCase):
             lambda: self._check_if_connect(dc1_dc2),
             timeout=self.RECOVERY_PERIOD,
         )
+        # check that no MON_NETSPLIT warning is raised
+        self.wait_until_true_and_hold(
+            lambda: not self._check_mon_netsplit_warning(),
+            timeout=self.RECOVERY_PERIOD,
+            success_hold_time=self.SUCCESS_HOLD_TIME
+        )
         # check if all the PGs are active+clean
         self.wait_until_true_and_hold(
             lambda: self._pg_all_active_clean(),
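
The helpers added above hinge on the JSON layout of `ceph health detail`; a
minimal standalone sketch of the same MON_NETSPLIT presence check, assuming
the ceph CLI is on PATH and using the same "checks" key the helpers parse:

    import json
    import subprocess

    def mon_netsplit_active():
        # Mirror _check_mon_netsplit_warning(): query health in JSON
        # and look for a MON_NETSPLIT entry among the health checks.
        out = subprocess.check_output(
            ['ceph', 'health', 'detail', '--format=json'])
        return 'MON_NETSPLIT' in json.loads(out).get('checks', {})
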
index 195eab5fe1420d99bfc27623cfa949fb11a53a04..68ae846865dcb138eaf5191024cac381b1fdc5ae 100755 (executable)
@@ -209,6 +209,51 @@ class TestNetSplit(CephTestCase):
         except Exception:
             return False
 
+    def _check_mon_netsplit_warning(self):
+        """
+        Returns True if MON_NETSPLIT warning exists in health checks.
+        """
+        (client,) = self.ctx.cluster.only(self.CLIENT).remotes.keys()
+        arg = ['ceph', 'health', 'detail', '--format=json']
+        proc = client.run(args=arg, wait=True, stdout=StringIO(), timeout=30)
+        if proc.exitstatus != 0:
+            log.error("ceph health detail failed")
+            raise Exception("ceph health detail failed")
+        out = proc.stdout.getvalue()
+        j = json.loads(out)
+        checks = j.get("checks", {})
+        log.debug("checks: {}".format(checks))
+        return "MON_NETSPLIT" in checks
+
+    def _check_mon_netsplit_warning_raised(self, detail):
+        """
+        Check if the MON_NETSPLIT warning with the given detail is raised.
+        """
+        log.info("Checking if MON_NETSPLIT warning is raised with detail: {}".format(detail))
+        (client,) = self.ctx.cluster.only(self.CLIENT).remotes.keys()
+        arg = ['ceph', 'health', 'detail', '--format=json']
+        proc = client.run(args=arg, wait=True, stdout=StringIO(), timeout=30)
+        if proc.exitstatus != 0:
+            log.error("ceph health detail failed")
+            raise Exception("ceph health detail failed")
+        out = proc.stdout.getvalue()
+        j = json.loads(out)
+        # Access health checks
+        checks = j.get("checks", {})
+        netsplit = checks.get("MON_NETSPLIT", {})
+        if not netsplit:
+            log.info("MON_NETSPLIT not found in health checks")
+            return False
+
+        # Check if the expected detail is present
+        for d in netsplit.get("detail", []):
+            if detail in d.get("message", ""):
+                log.info("Found MON_NETSPLIT warning with detail: {}".format(d))
+                return True
+
+        log.info("MON_NETSPLIT found but detail does not match")
+        return False
+
     def test_mon_netsplit(self):
         """
         Test the mon netsplit scenario, if cluster is actually accessible.
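
The detail check above matches on a location-level message of the form
"Netsplit detected between <loc1> and <loc2>"; a hedged sketch of that
matching step, assuming the same "detail"/"message" JSON layout parsed by
_check_mon_netsplit_warning_raised():

    def netsplit_between(checks, loc_a, loc_b):
        # Look for a MON_NETSPLIT detail message naming both locations,
        # e.g. "Netsplit detected between dc1 and dc2".
        expected = 'Netsplit detected between {} and {}'.format(loc_a, loc_b)
        netsplit = checks.get('MON_NETSPLIT', {})
        return any(expected in d.get('message', '')
                   for d in netsplit.get('detail', []))
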
@@ -247,6 +292,7 @@ class TestNetSplit(CephTestCase):
         # Scenario 1: disconnect Site 1 and Site 2
         # Site 3 is still connected to both Site 1 and Site 2
         config = ["mon.a", "mon.d"]
+        location = ["dc1", "dc2"]
         # disconnect the mons
         self._disconnect_mons(config)
         # wait for the mons to be disconnected
@@ -256,6 +302,16 @@ class TestNetSplit(CephTestCase):
             lambda: self._check_if_disconnect(config),
             timeout=self.RECOVERY_PERIOD,
         )
+        # check if the location-level MON_NETSPLIT warning is raised
+        self.wait_until_true_and_hold(
+            lambda: self._check_mon_netsplit_warning_raised(
+                "Netsplit detected between {} and {}".format(
+                    location[0], location[1]
+                ),
+            ),
+            timeout=self.RECOVERY_PERIOD,
+            success_hold_time=self.SUCCESS_HOLD_TIME
+        )
         # check the cluster is accessible
         self.wait_until_true_and_hold(
             lambda: self._reply_to_mon_command(),
@@ -271,6 +327,12 @@ class TestNetSplit(CephTestCase):
             lambda: self._check_if_connect(config),
             timeout=self.RECOVERY_PERIOD,
         )
+        # check if the MON_NETSPLIT warning is cleared
+        self.wait_until_true_and_hold(
+            lambda: not self._check_mon_netsplit_warning(),
+            timeout=self.RECOVERY_PERIOD,
+            success_hold_time=self.SUCCESS_HOLD_TIME
+        )
         # wait for the PGs to recover
         time.sleep(self.RECOVERY_PERIOD)
         # check if all PGs are active+clean