From: Nitzan Mordechai Date: Wed, 26 Jun 2024 06:06:23 +0000 (+0000) Subject: suites: host thrasher should check min_in before thrashing host X-Git-Tag: v20.0.0~1461^2~1 X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=66b42c3f5c014cd53d01424fe97d3568d626a482;p=ceph.git suites: host thrasher should check min_in before thrashing host We need to check if taking a host out will cause the total number of in OSDs to be less than min_in Fixes: https://tracker.ceph.com/issues/66657 Signed-off-by: Nitzan Mordechai --- diff --git a/qa/tasks/ceph_manager.py b/qa/tasks/ceph_manager.py index 4486e554ed587..163d172eb118b 100644 --- a/qa/tasks/ceph_manager.py +++ b/qa/tasks/ceph_manager.py @@ -569,23 +569,45 @@ class OSDThrasher(Thrasher): def out_host(self, host=None): """ - Make all osds on a host out + Make all OSDs on a host out if the host has more than min_in OSDs. :param host: Host to be marked. """ - # check that all osd remotes have a valid console + # Check that all OSD remotes have a valid console osds = self.ceph_manager.ctx.cluster.only(teuthology.is_type('osd', self.ceph_manager.cluster)) - if host is None: - host = random.choice(list(osds.remotes.keys())) - self.log("Removing all osds in host %s" % (host,)) - - for role in osds.remotes[host]: - if not role.startswith("osd."): - continue - osdid = int(role.split('.')[1]) - if self.in_osds.count(osdid) == 0: - continue - self.out_osd(osdid) + all_hosts = list(osds.remotes.keys()) + min_in = self.minin + + if host is not None: + all_hosts = [host] if host in all_hosts else [] + + random.shuffle(all_hosts) # Shuffle the list to pick hosts randomly + + for host in all_hosts: + self.log("Checking the number of in OSDs in host %s" % (host,)) + + # Count the number of in OSDs in the host + in_host_osd_count = 0 + for role in osds.remotes[host]: + if role.startswith("osd."): + osdid = int(role.split('.')[1]) + if osdid in self.in_osds: + in_host_osd_count += 1 + + # Check taking out that host will 
cause the number + # of in OSDs to be less than min_in + if len(self.in_osds) - in_host_osd_count >= min_in: + self.log("Removing all OSDs in host %s" % (host,)) + # Proceed to take out OSDs + for role in osds.remotes[host]: + if role.startswith("osd."): + osdid = int(role.split('.')[1]) + if osdid in self.in_osds: + self.out_osd(osdid) + return + else: + self.log("Host %s can't be trashed as it will left %d OSDs in" % (host, len(self.in_osds) - in_host_osd_count)) + self.log("No suitable host found to thrash") def out_osd(self, osd=None): """ @@ -1254,7 +1276,6 @@ class OSDThrasher(Thrasher): (minin, minout, minlive, mindead, chance_down)) actions = [] if thrash_hosts: - self.log("check thrash_hosts") if len(self.in_osds) > minin: self.log("check thrash_hosts: in_osds > minin") actions.append((self.out_host, 1.0,))