qa/suites/rados: whitelist + add MON_NETSPLIT tests
author    Kamoltat Sirivadhna <ksirivad@redhat.com>
          Fri, 25 Jul 2025 04:18:46 +0000 (04:18 +0000)
committer Kamoltat Sirivadhna <ksirivad@redhat.com>
          Fri, 10 Oct 2025 20:36:50 +0000 (20:36 +0000)
Some existing netsplit tests in 3az + stretch mode
needed whitelisting and checks for netsplit details.

Make qa/tasks/mon_thrash.py set
mon_netsplit_grace_period to 30 seconds
when we try to freeze monitors instead of killing them.

Make qa/tasks/stretch_mode_disable_enable.py set
mon_netsplit_grace_period to 30 seconds
during the `teardown` phase only.
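
Both of these amount to the same monitor command; a minimal sketch of the
equivalent CLI call (the qa tasks issue it via manager.raw_cluster_cmd()),
assuming a ceph client keyring is available on the node:

    import subprocess

    # Bump the netsplit grace period so slow reconnects after a
    # freeze/restart do not raise a spurious MON_NETSPLIT warning.
    subprocess.check_call(
        ['ceph', 'config', 'set', 'mon', 'mon_netsplit_grace_period', '30'])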

Fixes: https://tracker.ceph.com/issues/71344
Signed-off-by: Kamoltat Sirivadhna <ksirivad@redhat.com>
qa/suites/netsplit/ceph.yaml
qa/suites/rados/singleton/all/3-az-stretch-cluster-netsplit.yaml
qa/suites/rados/singleton/all/stretch-mode-5-mons-8-osds.yaml
qa/suites/rados/singleton/msgr-failures/few.yaml
qa/suites/rados/singleton/msgr-failures/many.yaml
qa/tasks/mon_thrash.py
qa/tasks/stretch_mode_disable_enable.py
qa/tasks/test_netsplit.py
qa/tasks/test_netsplit_3az_stretch_pool.py

index 9e90a87ee5a2806f6d832cdc97271942025ac839..5820fcefde7d9718a6c21fb9819308566a600b02 100644 (file)
@@ -31,6 +31,7 @@ overrides:
       - \(PG_AVAILABILITY\)
       - \(SLOW_OPS\)
       - \[WRN\]
+      - \(MON_NETSPLIT\)
 tasks:
 - install:
 - ceph:
index e78c95577dadeac9acaea946987ab41ddcbe5d35..3979533eed0bdbbd72703ae138357bde660258c4 100644 (file)
@@ -64,6 +64,7 @@ tasks:
       - \(PG_AVAILABILITY\)
       - \(SLOW_OPS\)
       - \[WRN\]
+      - \(MON_NETSPLIT\)
 - workunit:
     clients:
       client.0:
index 69a54b0f1b772a89738ac3be98600a1feb0dca9d..146080e5133fd3ea4b281704c1251886300e9109 100644 (file)
@@ -46,8 +46,6 @@ tasks:
       - \(OSD_DATACENTER_DOWN\)
       - \(OSD_DOWN\)
       - \(OSD_HOST_DOWN\)
-
-
 - workunit:
     clients:
       client.0:
index 8fd638744c817b758d48e7bbe147f279b3ea1b9d..f7cc4817114e283bc4c0e36145e78fea8e2630cf 100644 (file)
@@ -7,3 +7,4 @@ overrides:
     log-ignorelist:
       - \(OSD_SLOW_PING_TIME
       - \(MON_DOWN\)
+      - \(MON_NETSPLIT\)
index 206da3ec15acf710556937543cb6cfbc5b4395b6..2c66754851a850c4cee863db7c9c99839af34709 100644 (file)
@@ -11,3 +11,4 @@ overrides:
     log-ignorelist:
       - \(OSD_SLOW_PING_TIME
       - \(MON_DOWN\)
+      - \(MON_NETSPLIT\)
index 97fa38983a6f9b29d126c332257a87babc290957..3874076ae497e7c10653a130a520a54417c75a0b 100644 (file)
@@ -64,7 +64,7 @@ class MonitorThrasher(Thrasher):
                         task to run with as many as just one single monitor.
                         (default: True)
     freeze_mon_probability: how often to freeze the mon instead of killing it,
-                        in % (default: 0)
+                        in % (default: 10)
     freeze_mon_duration: how many seconds to freeze the mon (default: 15)
     scrub               Scrub after each iteration (default: True)
     check_mds_failover  Check if mds failover happened (default: False)
@@ -128,6 +128,15 @@ class MonitorThrasher(Thrasher):
         self.scrub = self.config.get('scrub', True)
 
         self.freeze_mon_probability = float(self.config.get('freeze_mon_probability', 10))
+        # When many monitors are frozen at once and revived after a long
+        # time, their connections can take longer to re-establish.
+        # Therefore, we increase the netsplit grace period to 30 seconds.
+        # This avoids false positives from MON_NETSPLIT while still
+        # keeping the integrity of the test.
+        if self.freeze_mon_probability > 0:
+            self.manager.raw_cluster_cmd(
+                'config', 'set', 'mon', 'mon_netsplit_grace_period', '30')
+
         self.freeze_mon_duration = float(self.config.get('freeze_mon_duration', 15.0))
 
         assert self.max_killable() > 0, \
index a84a85bb307c1e7823bf1bd4e72bf60713b4afb2..69a0da6c7741d9858c1212538838dfdea132f7cd 100644 (file)
@@ -122,13 +122,21 @@ class TestStretchMode(MgrTestCase):
         """
         Bring back the mon.
         """
+        log.debug("_bring_back_mon %s", mon)
+        # If the mon is already up, do nothing
+        quorum_names = self.mgr_cluster.mon_manager.get_mon_quorum_names()
+        if mon in quorum_names:
+            log.debug("mon.%s is already up", mon)
+            return
+        # If the mon is not up, try to bring it back
+        log.debug("Bringing back mon.%s", mon)
         try:
             self.ctx.daemons.get_daemon('mon', mon, self.CLUSTER).restart()
         except Exception:
             log.error("Failed to bring back mon.{}".format(str(mon)))
             pass
 
-    def _get_host(self, osd):
+    def _get_osd_host(self, osd):
         """
         Get the host of the osd.
         """
@@ -142,7 +150,7 @@ class TestStretchMode(MgrTestCase):
         """
         Move the osd back to the host.
         """
-        host = self._get_host(osd)
+        host = self._get_osd_host(osd)
         assert host is not None, "The host of osd {} is not found.".format(osd)
         log.debug("Moving osd.%d back to %s", osd, host)
         self.mgr_cluster.mon_manager.raw_cluster_cmd(
@@ -155,6 +163,7 @@ class TestStretchMode(MgrTestCase):
         Clean up the cluster after the test.
         """
         # Remove the pool
+        log.debug("Tear down the test")
         if self.POOL in self.mgr_cluster.mon_manager.pools:
             self.mgr_cluster.mon_manager.remove_pool(self.POOL)
 
@@ -168,7 +177,14 @@ class TestStretchMode(MgrTestCase):
             if osd['up'] == 0:
                 self.mgr_cluster.mon_manager.revive_osd(osd['osd'])
                 self._move_osd_back_to_host(osd['osd'])
-        
+
+        # Set mon_netsplit_grace_period to 30 seconds:
+        # when many mons restart at the same time, it can
+        # take longer for the monitors to re-establish
+        # their connections.
+        self.mgr_cluster.mon_manager.raw_cluster_cmd(
+            'config', 'set', 'mon', 'mon_netsplit_grace_period', '30'
+        )
         # Bring back all the mons
         mons = self._get_all_mons_from_all_dc()
         for mon in mons:
@@ -359,6 +375,7 @@ class TestStretchMode(MgrTestCase):
             self.TIEBREAKER_MON_NAME,
             monmap['tiebreaker_mon']
         )
+        log.debug("Stretch mode is enabled correctly.")
 
     def _stretch_mode_disabled_correctly(self):
         """
@@ -445,6 +462,7 @@ class TestStretchMode(MgrTestCase):
             "",
             monmap['tiebreaker_mon']
         )
+        log.debug("Stretch mode is disabled correctly.")
 
     def test_disable_stretch_mode(self):
         """
index a16adc7eaac1969b8d5ded5247288a70049e855e..6ee6b30a87e3de002ae38fabc8814f7b4f721408 100755 (executable)
@@ -193,6 +193,50 @@ class TestNetSplit(CephTestCase):
         except Exception:
             return False
 
+    def _check_mon_netsplit_warning(self):
+        """
+        Returns True if MON_NETSPLIT warning exists in health checks.
+        """
+        (client,) = self.ctx.cluster.only(self.CLIENT).remotes.keys()
+        arg = ['ceph', 'health', 'detail', '--format=json']
+        proc = client.run(args=arg, wait=True, stdout=StringIO(), timeout=30)
+        if proc.exitstatus != 0:
+            log.error("ceph health detail failed")
+            raise Exception("ceph health detail failed")
+        out = proc.stdout.getvalue()
+        j = json.loads(out)
+        checks = j.get("checks", {})
+        return "MON_NETSPLIT" in checks
+
+    def _check_mon_netsplit_warning_raised(self, detail):
+        """
+        Check if the MON_NETSPLIT warning with the given detail is raised.
+        """
+        log.info("Checking if MON_NETSPLIT warning is raised with detail: {}".format(detail))
+        (client,) = self.ctx.cluster.only(self.CLIENT).remotes.keys()
+        arg = ['ceph', 'health', 'detail', '--format=json']
+        proc = client.run(args=arg, wait=True, stdout=StringIO(), timeout=30)
+        if proc.exitstatus != 0:
+            log.error("ceph health detail failed")
+            raise Exception("ceph health detail failed")
+        out = proc.stdout.getvalue()
+        j = json.loads(out)
+        # Access health checks
+        checks = j.get("checks", {})
+        netsplit = checks.get("MON_NETSPLIT", {})
+        if not netsplit:
+            log.info("MON_NETSPLIT not found in health checks")
+            return False
+
+        # Check if the expected detail is present
+        for d in netsplit.get("detail", []):
+            if detail in d.get("message", ""):
+                log.info("Found MON_NETSPLIT warning with detail: {}".format(d))
+                return True
+
+        log.info("MON_NETSPLIT found but detail does not match")
+        return False
+
     def test_netsplit_dc1_dc2(self):
         """
         Test Netsplit between dc1 and dc2
@@ -220,6 +264,13 @@ class TestNetSplit(CephTestCase):
             lambda: self._check_if_disconnect(config),
             timeout=self.RECOVERY_PERIOD,
         )
+        # check that no MON_NETSPLIT warning is raised; we expect none
+        # because this is stretch mode
+        self.wait_until_true_and_hold(
+            lambda: not self._check_mon_netsplit_warning(),
+            timeout=self.RECOVERY_PERIOD,
+            success_hold_time=self.SUCCESS_HOLD_TIME
+        )
         # check the cluster is accessible
         self.wait_until_true_and_hold(
             lambda: self._reply_to_mon_command(),
@@ -263,6 +314,12 @@ class TestNetSplit(CephTestCase):
             lambda: self._check_if_connect(config),
             timeout=self.RECOVERY_PERIOD,
         )
+        # check if no MON_NETSPLIT warning is raised
+        self.wait_until_true_and_hold(
+            lambda: not self._check_mon_netsplit_warning(),
+            timeout=self.RECOVERY_PERIOD,
+            success_hold_time=self.SUCCESS_HOLD_TIME
+        )
         # check if all the PGs are active+clean
         self.wait_until_true_and_hold(
             lambda: self._pg_all_active_clean(),
@@ -305,6 +362,13 @@ class TestNetSplit(CephTestCase):
             lambda: self._check_if_disconnect(dc1_dc2),
             timeout=self.RECOVERY_PERIOD,
         )
+        # check that no MON_NETSPLIT warning is raised; we expect none
+        # because this is stretch mode
+        self.wait_until_true_and_hold(
+            lambda: not self._check_mon_netsplit_warning(),
+            timeout=self.RECOVERY_PERIOD,
+            success_hold_time=self.SUCCESS_HOLD_TIME
+        )
         # check the cluster is accessible
         self.wait_until_true_and_hold(
             lambda: self._reply_to_mon_command(),
@@ -352,6 +416,12 @@ class TestNetSplit(CephTestCase):
             lambda: self._check_if_connect(dc1_dc2),
             timeout=self.RECOVERY_PERIOD,
         )
+        # check that no MON_NETSPLIT warning is raised
+        self.wait_until_true_and_hold(
+            lambda: not self._check_mon_netsplit_warning(),
+            timeout=self.RECOVERY_PERIOD,
+            success_hold_time=self.SUCCESS_HOLD_TIME
+        )
         # check if all the PGs are active+clean
         self.wait_until_true_and_hold(
             lambda: self._pg_all_active_clean(),
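
The helpers added above hinge on the JSON layout of `ceph health detail`; a
minimal standalone sketch of the same MON_NETSPLIT presence check, assuming
the ceph CLI is on PATH and using the same "checks" key the helpers parse:

    import json
    import subprocess

    def mon_netsplit_active():
        # Mirror _check_mon_netsplit_warning(): query health in JSON
        # and look for a MON_NETSPLIT entry among the health checks.
        out = subprocess.check_output(
            ['ceph', 'health', 'detail', '--format=json'])
        return 'MON_NETSPLIT' in json.loads(out).get('checks', {})
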
index 195eab5fe1420d99bfc27623cfa949fb11a53a04..68ae846865dcb138eaf5191024cac381b1fdc5ae 100755 (executable)
@@ -209,6 +209,51 @@ class TestNetSplit(CephTestCase):
         except Exception:
             return False
 
+    def _check_mon_netsplit_warning(self):
+        """
+        Returns True if MON_NETSPLIT warning exists in health checks.
+        """
+        (client,) = self.ctx.cluster.only(self.CLIENT).remotes.keys()
+        arg = ['ceph', 'health', 'detail', '--format=json']
+        proc = client.run(args=arg, wait=True, stdout=StringIO(), timeout=30)
+        if proc.exitstatus != 0:
+            log.error("ceph health detail failed")
+            raise Exception("ceph health detail failed")
+        out = proc.stdout.getvalue()
+        j = json.loads(out)
+        checks = j.get("checks", {})
+        log.debug("checks: {}".format(checks))
+        return "MON_NETSPLIT" in checks
+
+    def _check_mon_netsplit_warning_raised(self, detail):
+        """
+        Check if the MON_NETSPLIT warning with the given detail is raised.
+        """
+        log.info("Checking if MON_NETSPLIT warning is raised with detail: {}".format(detail))
+        (client,) = self.ctx.cluster.only(self.CLIENT).remotes.keys()
+        arg = ['ceph', 'health', 'detail', '--format=json']
+        proc = client.run(args=arg, wait=True, stdout=StringIO(), timeout=30)
+        if proc.exitstatus != 0:
+            log.error("ceph health detail failed")
+            raise Exception("ceph health detail failed")
+        out = proc.stdout.getvalue()
+        j = json.loads(out)
+        # Access health checks
+        checks = j.get("checks", {})
+        netsplit = checks.get("MON_NETSPLIT", {})
+        if not netsplit:
+            log.info("MON_NETSPLIT not found in health checks")
+            return False
+
+        # Check if the expected detail is present
+        for d in netsplit.get("detail", []):
+            if detail in d.get("message", ""):
+                log.info("Found MON_NETSPLIT warning with detail: {}".format(d))
+                return True
+
+        log.info("MON_NETSPLIT found but detail does not match")
+        return False
+
     def test_mon_netsplit(self):
         """
         Test the mon netsplit scenario, if cluster is actually accessible.
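
The detail check above matches on a location-level message of the form
"Netsplit detected between <loc1> and <loc2>"; a hedged sketch of that
matching step, assuming the same "detail"/"message" JSON layout parsed by
_check_mon_netsplit_warning_raised():

    def netsplit_between(checks, loc_a, loc_b):
        # Look for a MON_NETSPLIT detail message naming both locations,
        # e.g. "Netsplit detected between dc1 and dc2".
        expected = 'Netsplit detected between {} and {}'.format(loc_a, loc_b)
        netsplit = checks.get('MON_NETSPLIT', {})
        return any(expected in d.get('message', '')
                   for d in netsplit.get('detail', []))
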
@@ -247,6 +292,7 @@ class TestNetSplit(CephTestCase):
         # Scenario 1: disconnect Site 1 and Site 2
         # Site 3 is still connected to both Site 1 and Site 2
         config = ["mon.a", "mon.d"]
+        location = ["dc1", "dc2"]
         # disconnect the mons
         self._disconnect_mons(config)
         # wait for the mons to be disconnected
@@ -256,6 +302,16 @@ class TestNetSplit(CephTestCase):
             lambda: self._check_if_disconnect(config),
             timeout=self.RECOVERY_PERIOD,
         )
+        # check if the location-level MON_NETSPLIT warning is raised
+        self.wait_until_true_and_hold(
+            lambda: self._check_mon_netsplit_warning_raised(
+                "Netsplit detected between {} and {}".format(
+                    location[0], location[1]
+                ),
+            ),
+            timeout=self.RECOVERY_PERIOD,
+            success_hold_time=self.SUCCESS_HOLD_TIME
+        )
         # check the cluster is accessible
         self.wait_until_true_and_hold(
             lambda: self._reply_to_mon_command(),
@@ -271,6 +327,12 @@ class TestNetSplit(CephTestCase):
             lambda: self._check_if_connect(config),
             timeout=self.RECOVERY_PERIOD,
         )
+        # check if the MON_NETSPLIT warning is cleared
+        self.wait_until_true_and_hold(
+            lambda: not self._check_mon_netsplit_warning(),
+            timeout=self.RECOVERY_PERIOD,
+            success_hold_time=self.SUCCESS_HOLD_TIME
+        )
         # wait for the PGs to recover
         time.sleep(self.RECOVERY_PERIOD)
         # check if all PGs are active+clean