- \(PG_AVAILABILITY\)
- \(SLOW_OPS\)
- \[WRN\]
+ - \(MON_NETSPLIT\)
tasks:
- install:
- ceph:
- \(PG_AVAILABILITY\)
- \(SLOW_OPS\)
- \[WRN\]
+ - \(MON_NETSPLIT\)
- workunit:
clients:
client.0:
- \(OSD_DATACENTER_DOWN\)
- \(OSD_DOWN\)
- \(OSD_HOST_DOWN\)
-
-
- workunit:
clients:
client.0:
log-ignorelist:
- \(OSD_SLOW_PING_TIME
- \(MON_DOWN\)
+ - \(MON_NETSPLIT\)
log-ignorelist:
- \(OSD_SLOW_PING_TIME
- \(MON_DOWN\)
+ - \(MON_NETSPLIT\)
task to run with as many as just one single monitor.
(default: True)
freeze_mon_probability: how often to freeze the mon instead of killing it,
- in % (default: 0)
+ in % (default: 10)
freeze_mon_duration: how many seconds to freeze the mon (default: 15)
scrub Scrub after each iteration (default: True)
check_mds_failover Check if mds failover happened (default: False)
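For illustration only (a sketch, not taken from this change; the option names
follow the docstring above and the values are placeholders), these options
could be set on the mon_thrash task in a teuthology job like:

    tasks:
    - ceph:
    - mon_thrash:
        freeze_mon_probability: 10
        freeze_mon_duration: 15
        scrub: true
        check_mds_failover: false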
self.scrub = self.config.get('scrub', True)
self.freeze_mon_probability = float(self.config.get('freeze_mon_probability', 10))
+ # When many monitors freeze at once and only revive after a long
+ # time, their connections can take longer to re-establish.
+ # Therefore, we increase the netsplit grace period to 30 seconds to
+ # avoid false positives in the netsplit check while still keeping
+ # the integrity of the test.
+ if self.freeze_mon_probability > 0:
+ self.manager.raw_cluster_cmd(
+ 'config', 'set', 'mon', 'mon_netsplit_grace_period', '30')
+
self.freeze_mon_duration = float(self.config.get('freeze_mon_duration', 15.0))
assert self.max_killable() > 0, \
"""
Bring back the mon.
"""
+ log.debug("_bring_back_mon %s", mon)
+ # If the mon is already in quorum, do nothing
+ quorum_names = self.mgr_cluster.mon_manager.get_mon_quorum_names()
+ if mon in quorum_names:
+ log.debug("mon.%s is already in quorum", mon)
+ return
+ # If the mon is not up, try to bring it back
+ log.debug("Bringing back mon.%s", mon)
try:
self.ctx.daemons.get_daemon('mon', mon, self.CLUSTER).restart()
except Exception:
log.error("Failed to bring back mon.{}".format(str(mon)))
pass
- def _get_host(self, osd):
+ def _get_osd_host(self, osd):
"""
Get the host of the osd.
"""
"""
Move the osd back to the host.
"""
- host = self._get_host(osd)
+ host = self._get_osd_host(osd)
assert host is not None, "The host of osd {} is not found.".format(osd)
log.debug("Moving osd.%d back to %s", osd, host)
self.mgr_cluster.mon_manager.raw_cluster_cmd(
Clean up the cluster after the test.
"""
+ log.debug("Tearing down the test")
# Remove the pool
if self.POOL in self.mgr_cluster.mon_manager.pools:
self.mgr_cluster.mon_manager.remove_pool(self.POOL)
if osd['up'] == 0:
self.mgr_cluster.mon_manager.revive_osd(osd['osd'])
self._move_osd_back_to_host(osd['osd'])
-
+
+ # Set mon_netsplit_grace_period to 30 seconds: when many mons
+ # restart at the same time, it can take longer for them to
+ # re-establish connections, which could otherwise be flagged
+ # as a netsplit.
+ self.mgr_cluster.mon_manager.raw_cluster_cmd(
+ 'config', 'set', 'mon', 'mon_netsplit_grace_period', '30'
+ )
# Bring back all the mons
mons = self._get_all_mons_from_all_dc()
for mon in mons:
self.TIEBREAKER_MON_NAME,
monmap['tiebreaker_mon']
)
+ log.debug("Stretch mode is enabled correctly.")
def _stretch_mode_disabled_correctly(self):
"""
"",
monmap['tiebreaker_mon']
)
+ log.debug("Stretch mode is disabled correctly.")
def test_disable_stretch_mode(self):
"""
except Exception:
return False
+ def _check_mon_netsplit_warning(self):
+ """
+ Returns True if MON_NETSPLIT warning exists in health checks.
+ """
+ (client,) = self.ctx.cluster.only(self.CLIENT).remotes.keys()
+ arg = ['ceph', 'health', 'detail', '--format=json']
+ proc = client.run(args=arg, wait=True, stdout=StringIO(), timeout=30)
+ if proc.exitstatus != 0:
+ log.error("ceph health detail failed")
+ raise Exception("ceph health detail failed")
+ out = proc.stdout.getvalue()
+ j = json.loads(out)
+ checks = j.get("checks", {})
+ return "MON_NETSPLIT" in checks
+
+ def _check_mon_netsplit_warning_raised(self, detail):
+ """
+ Check if the MON_NETSPLIT warning with the given detail is raised.
+ """
+ log.info("Checking if MON_NETSPLIT warning is raised with detail: {}".format(detail))
+ (client,) = self.ctx.cluster.only(self.CLIENT).remotes.keys()
+ arg = ['ceph', 'health', 'detail', '--format=json']
+ proc = client.run(args=arg, wait=True, stdout=StringIO(), timeout=30)
+ if proc.exitstatus != 0:
+ log.error("ceph health detail failed")
+ raise Exception("ceph health detail failed")
+ out = proc.stdout.getvalue()
+ j = json.loads(out)
+ # Access health checks
+ checks = j.get("checks", {})
+ netsplit = checks.get("MON_NETSPLIT", {})
+ if not netsplit:
+ log.info("MON_NETSPLIT not found in health checks")
+ return False
+
+ # Check if the expected detail is present
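+ # Illustrative shape only (not captured output): each entry in
+ # "detail" is expected to be a dict such as
+ # {"message": "Netsplit detected between dc1 and dc2"},
+ # so we substring-match the given detail against its "message" field.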
+ for d in netsplit.get("detail", []):
+ if detail in d.get("message", ""):
+ log.info("Found MON_NETSPLIT warning with detail: {}".format(d))
+ return True
+
+ log.info("MON_NETSPLIT found but detail does not match")
+ return False
+
def test_netsplit_dc1_dc2(self):
"""
Test Netsplit between dc1 and dc2
lambda: self._check_if_disconnect(config),
timeout=self.RECOVERY_PERIOD,
)
+ # check that no MON_NETSPLIT warning is raised; we expect
+ # none because this is stretch mode
+ self.wait_until_true_and_hold(
+ lambda: not self._check_mon_netsplit_warning(),
+ timeout=self.RECOVERY_PERIOD,
+ success_hold_time=self.SUCCESS_HOLD_TIME
+ )
# check the cluster is accessible
self.wait_until_true_and_hold(
lambda: self._reply_to_mon_command(),
lambda: self._check_if_connect(config),
timeout=self.RECOVERY_PERIOD,
)
+ # check if no MON_NETSPLIT warning is raised
+ self.wait_until_true_and_hold(
+ lambda: not self._check_mon_netsplit_warning(),
+ timeout=self.RECOVERY_PERIOD,
+ success_hold_time=self.SUCCESS_HOLD_TIME
+ )
# check if all the PGs are active+clean
self.wait_until_true_and_hold(
lambda: self._pg_all_active_clean(),
lambda: self._check_if_disconnect(dc1_dc2),
timeout=self.RECOVERY_PERIOD,
)
+ # check that no MON_NETSPLIT warning is raised; we expect
+ # none because this is stretch mode
+ self.wait_until_true_and_hold(
+ lambda: not self._check_mon_netsplit_warning(),
+ timeout=self.RECOVERY_PERIOD,
+ success_hold_time=self.SUCCESS_HOLD_TIME
+ )
# check the cluster is accessible
self.wait_until_true_and_hold(
lambda: self._reply_to_mon_command(),
lambda: self._check_if_connect(dc1_dc2),
timeout=self.RECOVERY_PERIOD,
)
+ # check that no MON_NETSPLIT warning is raised
+ self.wait_until_true_and_hold(
+ lambda: not self._check_mon_netsplit_warning(),
+ timeout=self.RECOVERY_PERIOD,
+ success_hold_time=self.SUCCESS_HOLD_TIME
+ )
# check if all the PGs are active+clean
self.wait_until_true_and_hold(
lambda: self._pg_all_active_clean(),
except Exception:
return False
+ def _check_mon_netsplit_warning(self):
+ """
+ Returns True if MON_NETSPLIT warning exists in health checks.
+ """
+ (client,) = self.ctx.cluster.only(self.CLIENT).remotes.keys()
+ arg = ['ceph', 'health', 'detail', '--format=json']
+ proc = client.run(args=arg, wait=True, stdout=StringIO(), timeout=30)
+ if proc.exitstatus != 0:
+ log.error("ceph health detail failed")
+ raise Exception("ceph health detail failed")
+ out = proc.stdout.getvalue()
+ j = json.loads(out)
+ checks = j.get("checks", {})
+ log.debug("checks: {}".format(checks))
+ return "MON_NETSPLIT" in checks
+
+ def _check_mon_netsplit_warning_raised(self, detail):
+ """
+ Check if the MON_NETSPLIT warning with the given detail is raised.
+ """
+ log.info("Checking if MON_NETSPLIT warning is raised with detail: {}".format(detail))
+ (client,) = self.ctx.cluster.only(self.CLIENT).remotes.keys()
+ arg = ['ceph', 'health', 'detail', '--format=json']
+ proc = client.run(args=arg, wait=True, stdout=StringIO(), timeout=30)
+ if proc.exitstatus != 0:
+ log.error("ceph health detail failed")
+ raise Exception("ceph health detail failed")
+ out = proc.stdout.getvalue()
+ j = json.loads(out)
+ # Access health checks
+ checks = j.get("checks", {})
+ netsplit = checks.get("MON_NETSPLIT", {})
+ if not netsplit:
+ log.info("MON_NETSPLIT not found in health checks")
+ return False
+
+ # Check if the expected detail is present
+ for d in netsplit.get("detail", []):
+ if detail in d.get("message", ""):
+ log.info("Found MON_NETSPLIT warning with detail: {}".format(d))
+ return True
+
+ log.info("MON_NETSPLIT found but detail does not match")
+ return False
+
def test_mon_netsplit(self):
"""
Test the mon netsplit scenario, if cluster is actually accessible.
# Scenario 1: disconnect Site 1 and Site 2
# Site 3 is still connected to both Site 1 and Site 2
config = ["mon.a", "mon.d"]
+ location = ["dc1", "dc2"]
# disconnect the mons
self._disconnect_mons(config)
# wait for the mons to be disconnected
lambda: self._check_if_disconnect(config),
timeout=self.RECOVERY_PERIOD,
)
+ # check that the location-level MON_NETSPLIT warning is raised
+ self.wait_until_true_and_hold(
+ lambda: self._check_mon_netsplit_warning_raised(
+ "Netsplit detected between {} and {}".format(
+ location[0], location[1]
+ ),
+ ),
+ timeout=self.RECOVERY_PERIOD,
+ success_hold_time=self.SUCCESS_HOLD_TIME
+ )
# check the cluster is accessible
self.wait_until_true_and_hold(
lambda: self._reply_to_mon_command(),
lambda: self._check_if_connect(config),
timeout=self.RECOVERY_PERIOD,
)
+ # check if the MON_NETSPLIT warning is cleared
+ self.wait_until_true_and_hold(
+ lambda: not self._check_mon_netsplit_warning(),
+ timeout=self.RECOVERY_PERIOD,
+ success_hold_time=self.SUCCESS_HOLD_TIME
+ )
# wait for the PGs to recover
time.sleep(self.RECOVERY_PERIOD)
# check if all PGs are active+clean