From: Kamoltat Sirivadhna
Date: Fri, 25 Jul 2025 04:18:46 +0000 (+0000)
Subject: qa/suites/rados: whitelist + add MON_NETSPLIT tests
X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=abfa5a367736e682cfcba35d7ac7fc9bbd52627d;p=ceph.git

qa/suites/rados: whitelist + add MON_NETSPLIT tests

Some existing netsplit tests in 3az + stretch mode needed whitelisting
and checks for the netsplit details.

Make qa/tasks/mon_thrash.py set mon_netsplit_grace_period to 30 seconds
when we try to freeze monitors instead of killing them.

Make qa/tasks/stretch_mode_disable_enable.py set mon_netsplit_grace_period
to 30 seconds during the `teardown` phase only.

Fixes: https://tracker.ceph.com/issues/71344

Signed-off-by: Kamoltat Sirivadhna
---

diff --git a/qa/suites/netsplit/ceph.yaml b/qa/suites/netsplit/ceph.yaml
index 9e90a87ee5a..5820fcefde7 100644
--- a/qa/suites/netsplit/ceph.yaml
+++ b/qa/suites/netsplit/ceph.yaml
@@ -31,6 +31,7 @@ overrides:
       - \(PG_AVAILABILITY\)
       - \(SLOW_OPS\)
       - \[WRN\]
+      - \(MON_NETSPLIT\)
 tasks:
 - install:
 - ceph:
diff --git a/qa/suites/rados/singleton/all/3-az-stretch-cluster-netsplit.yaml b/qa/suites/rados/singleton/all/3-az-stretch-cluster-netsplit.yaml
index e78c95577da..3979533eed0 100644
--- a/qa/suites/rados/singleton/all/3-az-stretch-cluster-netsplit.yaml
+++ b/qa/suites/rados/singleton/all/3-az-stretch-cluster-netsplit.yaml
@@ -64,6 +64,7 @@ tasks:
       - \(PG_AVAILABILITY\)
       - \(SLOW_OPS\)
       - \[WRN\]
+      - \(MON_NETSPLIT\)
 - workunit:
     clients:
       client.0:
diff --git a/qa/suites/rados/singleton/all/stretch-mode-5-mons-8-osds.yaml b/qa/suites/rados/singleton/all/stretch-mode-5-mons-8-osds.yaml
index 69a54b0f1b7..146080e5133 100644
--- a/qa/suites/rados/singleton/all/stretch-mode-5-mons-8-osds.yaml
+++ b/qa/suites/rados/singleton/all/stretch-mode-5-mons-8-osds.yaml
@@ -46,8 +46,6 @@ tasks:
       - \(OSD_DATACENTER_DOWN\)
      - \(OSD_DOWN\)
      - \(OSD_HOST_DOWN\)
-
-
 - workunit:
     clients:
       client.0:
diff --git a/qa/suites/rados/singleton/msgr-failures/few.yaml b/qa/suites/rados/singleton/msgr-failures/few.yaml
index 8fd638744c8..f7cc4817114 100644
--- a/qa/suites/rados/singleton/msgr-failures/few.yaml
+++ b/qa/suites/rados/singleton/msgr-failures/few.yaml
@@ -7,3 +7,4 @@ overrides:
     log-ignorelist:
       - \(OSD_SLOW_PING_TIME
       - \(MON_DOWN\)
+      - \(MON_NETSPLIT\)
diff --git a/qa/suites/rados/singleton/msgr-failures/many.yaml b/qa/suites/rados/singleton/msgr-failures/many.yaml
index 206da3ec15a..2c66754851a 100644
--- a/qa/suites/rados/singleton/msgr-failures/many.yaml
+++ b/qa/suites/rados/singleton/msgr-failures/many.yaml
@@ -11,3 +11,4 @@ overrides:
     log-ignorelist:
       - \(OSD_SLOW_PING_TIME
       - \(MON_DOWN\)
+      - \(MON_NETSPLIT\)
diff --git a/qa/tasks/mon_thrash.py b/qa/tasks/mon_thrash.py
index 97fa38983a6..3874076ae49 100644
--- a/qa/tasks/mon_thrash.py
+++ b/qa/tasks/mon_thrash.py
@@ -64,7 +64,7 @@ class MonitorThrasher(Thrasher):
     task to run with as many as just one single monitor.
     (default: True)
     freeze_mon_probability: how often to freeze the mon instead of killing it,
-    in % (default: 0)
+    in % (default: 10)
     freeze_mon_duration: how many seconds to freeze the mon (default: 15)
     scrub Scrub after each iteration (default: True)
     check_mds_failover Check if mds failover happened (default: False)
@@ -128,6 +128,15 @@ class MonitorThrasher(Thrasher):
         self.scrub = self.config.get('scrub', True)

         self.freeze_mon_probability = float(self.config.get('freeze_mon_probability', 10))
+        # In some cases, monitors that froze at once and revived after a
+        # long time may take longer to re-establish their connections.
+        # Therefore, we increase the netsplit grace period to 30 seconds.
+        # This is to avoid false positives in the netsplit test, while
+        # still keeping the integrity of the test.
+        if self.freeze_mon_probability > 0:
+            self.manager.raw_cluster_cmd(
+                'config', 'set', 'mon', 'mon_netsplit_grace_period', '30')
+
         self.freeze_mon_duration = float(self.config.get('freeze_mon_duration', 15.0))

         assert self.max_killable() > 0, \
diff --git a/qa/tasks/stretch_mode_disable_enable.py b/qa/tasks/stretch_mode_disable_enable.py
index a84a85bb307..69a0da6c774 100644
--- a/qa/tasks/stretch_mode_disable_enable.py
+++ b/qa/tasks/stretch_mode_disable_enable.py
@@ -122,13 +122,21 @@ class TestStretchMode(MgrTestCase):
         """
         Bring back the mon.
         """
+        log.debug("_bring_back_mon %s", mon)
+        # If the mon is already up, do nothing
+        quorum_names = self.mgr_cluster.mon_manager.get_mon_quorum_names()
+        if mon in quorum_names:
+            log.debug("mon.%s is already up", mon)
+            return
+        # If the mon is not up, try to bring it back
+        log.debug("Bringing back mon.%s", mon)
         try:
             self.ctx.daemons.get_daemon('mon', mon, self.CLUSTER).restart()
         except Exception:
             log.error("Failed to bring back mon.{}".format(str(mon)))
             pass

-    def _get_host(self, osd):
+    def _get_osd_host(self, osd):
         """
         Get the host of the osd.
         """
@@ -142,7 +150,7 @@ class TestStretchMode(MgrTestCase):
         """
         Move the osd back to the host.
         """
-        host = self._get_host(osd)
+        host = self._get_osd_host(osd)
         assert host is not None, "The host of osd {} is not found.".format(osd)
         log.debug("Moving osd.%d back to %s", osd, host)
         self.mgr_cluster.mon_manager.raw_cluster_cmd(
@@ -155,6 +163,7 @@ class TestStretchMode(MgrTestCase):
         """
         Clean up the cluster after the test.
         """
         # Remove the pool
+        log.debug("Tear down the test")
         if self.POOL in self.mgr_cluster.mon_manager.pools:
             self.mgr_cluster.mon_manager.remove_pool(self.POOL)
@@ -168,7 +177,14 @@ class TestStretchMode(MgrTestCase):
             if osd['up'] == 0:
                 self.mgr_cluster.mon_manager.revive_osd(osd['osd'])
                 self._move_osd_back_to_host(osd['osd'])
-
+
+        # Set the mon_netsplit_grace_period to 30 seconds.
+        # Sometimes, when many mons restart at the same time,
+        # it can take longer for the monitors to establish
+        # a connection.
+        self.mgr_cluster.mon_manager.raw_cluster_cmd(
+            'config', 'set', 'mon', 'mon_netsplit_grace_period', '30'
+        )
         # Bring back all the mons
         mons = self._get_all_mons_from_all_dc()
         for mon in mons:
@@ -359,6 +375,7 @@ class TestStretchMode(MgrTestCase):
             self.TIEBREAKER_MON_NAME,
             monmap['tiebreaker_mon']
         )
+        log.debug("Stretch mode is enabled correctly.")

     def _stretch_mode_disabled_correctly(self):
         """
@@ -445,6 +462,7 @@ class TestStretchMode(MgrTestCase):
             "",
             monmap['tiebreaker_mon']
         )
+        log.debug("Stretch mode is disabled correctly.")

     def test_disable_stretch_mode(self):
         """
diff --git a/qa/tasks/test_netsplit.py b/qa/tasks/test_netsplit.py
index a16adc7eaac..6ee6b30a87e 100755
--- a/qa/tasks/test_netsplit.py
+++ b/qa/tasks/test_netsplit.py
@@ -193,6 +193,50 @@ class TestNetSplit(CephTestCase):
         except Exception:
             return False

+    def _check_mon_netsplit_warning(self):
+        """
+        Returns True if MON_NETSPLIT warning exists in health checks.
+        """
+        (client,) = self.ctx.cluster.only(self.CLIENT).remotes.keys()
+        arg = ['ceph', 'health', 'detail', '--format=json']
+        proc = client.run(args=arg, wait=True, stdout=StringIO(), timeout=30)
+        if proc.exitstatus != 0:
+            log.error("ceph health detail failed")
+            raise Exception("ceph health detail failed")
+        out = proc.stdout.getvalue()
+        j = json.loads(out)
+        checks = j.get("checks", {})
+        return "MON_NETSPLIT" in checks
+
+    def _check_mon_netsplit_warning_raised(self, detail):
+        """
+        Check if the MON_NETSPLIT warning with the given detail is raised.
+        """
+        log.info("Checking if MON_NETSPLIT warning is raised with detail: {}".format(detail))
+        (client,) = self.ctx.cluster.only(self.CLIENT).remotes.keys()
+        arg = ['ceph', 'health', 'detail', '--format=json']
+        proc = client.run(args=arg, wait=True, stdout=StringIO(), timeout=30)
+        if proc.exitstatus != 0:
+            log.error("ceph health detail failed")
+            raise Exception("ceph health detail failed")
+        out = proc.stdout.getvalue()
+        j = json.loads(out)
+        # Access health checks
+        checks = j.get("checks", {})
+        netsplit = checks.get("MON_NETSPLIT", {})
+        if not netsplit:
+            log.info("MON_NETSPLIT not found in health checks")
+            return False
+
+        # Check if the expected detail is present
+        for d in netsplit.get("detail", []):
+            if detail in d.get("message", ""):
+                log.info("Found MON_NETSPLIT warning with detail: {}".format(d))
+                return True
+
+        log.info("MON_NETSPLIT found but detail does not match")
+        return False
+
     def test_netsplit_dc1_dc2(self):
         """
         Test Netsplit between dc1 and dc2
@@ -220,6 +264,13 @@ class TestNetSplit(CephTestCase):
             lambda: self._check_if_disconnect(config),
             timeout=self.RECOVERY_PERIOD,
         )
+        # check that the MON_NETSPLIT warning is not raised; we expect
+        # none because this is stretch mode
+        self.wait_until_true_and_hold(
+            lambda: not self._check_mon_netsplit_warning(),
+            timeout=self.RECOVERY_PERIOD,
+            success_hold_time=self.SUCCESS_HOLD_TIME
+        )
         # check the cluster is accessible
         self.wait_until_true_and_hold(
             lambda: self._reply_to_mon_command(),
@@ -263,6 +314,12 @@ class TestNetSplit(CephTestCase):
             lambda: self._check_if_connect(config),
             timeout=self.RECOVERY_PERIOD,
         )
+        # check that no MON_NETSPLIT warning is raised
+        self.wait_until_true_and_hold(
+            lambda: not self._check_mon_netsplit_warning(),
+            timeout=self.RECOVERY_PERIOD,
+            success_hold_time=self.SUCCESS_HOLD_TIME
+        )
         # check if all the PGs are active+clean
         self.wait_until_true_and_hold(
             lambda: self._pg_all_active_clean(),
@@ -305,6 +362,13 @@ class TestNetSplit(CephTestCase):
             lambda: self._check_if_disconnect(dc1_dc2),
             timeout=self.RECOVERY_PERIOD,
         )
+        # check that the MON_NETSPLIT warning is not raised; we expect
+        # none because this is stretch mode
+        self.wait_until_true_and_hold(
+            lambda: not self._check_mon_netsplit_warning(),
+            timeout=self.RECOVERY_PERIOD,
+            success_hold_time=self.SUCCESS_HOLD_TIME
+        )
         # check the cluster is accessible
         self.wait_until_true_and_hold(
             lambda: self._reply_to_mon_command(),
@@ -352,6 +416,12 @@ class TestNetSplit(CephTestCase):
             lambda: self._check_if_connect(dc1_dc2),
             timeout=self.RECOVERY_PERIOD,
         )
+        # check that the MON_NETSPLIT warning is not raised
+        self.wait_until_true_and_hold(
+            lambda: not self._check_mon_netsplit_warning(),
+            timeout=self.RECOVERY_PERIOD,
+            success_hold_time=self.SUCCESS_HOLD_TIME
+        )
         # check if all the PGs are active+clean
         self.wait_until_true_and_hold(
             lambda: self._pg_all_active_clean(),
diff --git a/qa/tasks/test_netsplit_3az_stretch_pool.py b/qa/tasks/test_netsplit_3az_stretch_pool.py
index 195eab5fe14..68ae846865d 100755
--- a/qa/tasks/test_netsplit_3az_stretch_pool.py
+++ b/qa/tasks/test_netsplit_3az_stretch_pool.py
@@ -209,6 +209,51 @@ class TestNetSplit(CephTestCase):
         except Exception:
             return False

+    def _check_mon_netsplit_warning(self):
+        """
+        Returns True if MON_NETSPLIT warning exists in health checks.
+        """
+        (client,) = self.ctx.cluster.only(self.CLIENT).remotes.keys()
+        arg = ['ceph', 'health', 'detail', '--format=json']
+        proc = client.run(args=arg, wait=True, stdout=StringIO(), timeout=30)
+        if proc.exitstatus != 0:
+            log.error("ceph health detail failed")
+            raise Exception("ceph health detail failed")
+        out = proc.stdout.getvalue()
+        j = json.loads(out)
+        checks = j.get("checks", {})
+        log.debug("checks: {}".format(checks))
+        return "MON_NETSPLIT" in checks
+
+    def _check_mon_netsplit_warning_raised(self, detail):
+        """
+        Check if the MON_NETSPLIT warning with the given detail is raised.
+        """
+        log.info("Checking if MON_NETSPLIT warning is raised with detail: {}".format(detail))
+        (client,) = self.ctx.cluster.only(self.CLIENT).remotes.keys()
+        arg = ['ceph', 'health', 'detail', '--format=json']
+        proc = client.run(args=arg, wait=True, stdout=StringIO(), timeout=30)
+        if proc.exitstatus != 0:
+            log.error("ceph health detail failed")
+            raise Exception("ceph health detail failed")
+        out = proc.stdout.getvalue()
+        j = json.loads(out)
+        # Access health checks
+        checks = j.get("checks", {})
+        netsplit = checks.get("MON_NETSPLIT", {})
+        if not netsplit:
+            log.info("MON_NETSPLIT not found in health checks")
+            return False
+
+        # Check if the expected detail is present
+        for d in netsplit.get("detail", []):
+            if detail in d.get("message", ""):
+                log.info("Found MON_NETSPLIT warning with detail: {}".format(d))
+                return True
+
+        log.info("MON_NETSPLIT found but detail does not match")
+        return False
+
     def test_mon_netsplit(self):
         """
         Test the mon netsplit scenario, if cluster is actually accessible.
@@ -247,6 +292,7 @@ class TestNetSplit(CephTestCase):
         # Scenario 1: disconnect Site 1 and Site 2
         # Site 3 is still connected to both Site 1 and Site 2
         config = ["mon.a", "mon.d"]
+        location = ["dc1", "dc2"]
         # disconnect the mons
         self._disconnect_mons(config)
         # wait for the mons to be disconnected
@@ -256,6 +302,16 @@ class TestNetSplit(CephTestCase):
             lambda: self._check_if_disconnect(config),
             timeout=self.RECOVERY_PERIOD,
         )
+        # check if location level MON_NETSPLIT warning is raised
+        self.wait_until_true_and_hold(
+            lambda: self._check_mon_netsplit_warning_raised(
+                "Netsplit detected between {} and {}".format(
+                    location[0], location[1]
+                ),
+            ),
+            timeout=self.RECOVERY_PERIOD,
+            success_hold_time=self.SUCCESS_HOLD_TIME
+        )
         # check the cluster is accessible
         self.wait_until_true_and_hold(
             lambda: self._reply_to_mon_command(),
@@ -271,6 +327,12 @@ class TestNetSplit(CephTestCase):
             lambda: self._check_if_connect(config),
             timeout=self.RECOVERY_PERIOD,
         )
+        # check if the MON_NETSPLIT warning is cleared
+        self.wait_until_true_and_hold(
+            lambda: not self._check_mon_netsplit_warning(),
+            timeout=self.RECOVERY_PERIOD,
+            success_hold_time=self.SUCCESS_HOLD_TIME
+        )
         # wait for the PGs to recover
         time.sleep(self.RECOVERY_PERIOD)
         # check if all PGs are active+clean
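
For reference, the _check_mon_netsplit_warning() and _check_mon_netsplit_warning_raised() helpers added above reduce to one query: run `ceph health detail --format=json` and look up MON_NETSPLIT in the "checks" map, optionally matching a substring of its detail messages. Below is a minimal standalone sketch of that query for poking at a test cluster by hand; it assumes only a reachable cluster with the standard `ceph` CLI on PATH, and the function name is illustrative rather than part of the patch.

    import json
    import subprocess


    def mon_netsplit_warning(detail_substring=None):
        """Return True if MON_NETSPLIT is present (and, if given, a detail message matches)."""
        out = subprocess.check_output(
            ['ceph', 'health', 'detail', '--format=json'], timeout=30)
        checks = json.loads(out).get('checks', {})
        netsplit = checks.get('MON_NETSPLIT')
        if not netsplit:
            return False
        if detail_substring is None:
            return True
        # Health check details are a list of {"message": "..."} entries.
        return any(detail_substring in d.get('message', '')
                   for d in netsplit.get('detail', []))


    if __name__ == '__main__':
        # In the 3az test above, cutting the dc1<->dc2 link is expected to
        # produce a detail message like "Netsplit detected between dc1 and dc2".
        print(mon_netsplit_warning())
        print(mon_netsplit_warning('Netsplit detected between dc1 and dc2'))

The stretch-mode tests above assert the opposite condition (no MON_NETSPLIT warning at all), which is why those callers wrap the check as `lambda: not self._check_mon_netsplit_warning()`.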