qa: test_pool_min_size should kill osds first then mark them out

author Bill Scales <bill_scales@uk.ibm.com>

Sun, 17 Aug 2025 15:42:11 +0000 (16:42 +0100)

committer Bill Scales <bill_scales@uk.ibm.com>

Sun, 17 Aug 2025 15:54:26 +0000 (16:54 +0100)
author Bill Scales <bill_scales@uk.ibm.com>
Sun, 17 Aug 2025 15:42:11 +0000 (16:42 +0100)
committer Bill Scales <bill_scales@uk.ibm.com>
Sun, 17 Aug 2025 15:54:26 +0000 (16:54 +0100)
diff --git a/qa/tasks/ceph_manager.py b/qa/tasks/ceph_manager.py

index 0f7e92c5c2fb98af3c1dacbc43656e7fe53545e1..386727851637f01dafd468b8ce8efa1399134822 100644 (file)
--- a/qa/tasks/ceph_manager.py
+++ b/qa/tasks/ceph_manager.py
@@ -1046,8 +1046,19 @@ class OSDThrasher(Thrasher):
                  self.log("chose to kill {n} OSDs".format(n=most_killable))
                  acting_set = self.get_rand_pg_acting_set(pool_id)
                  assert most_killable < len(acting_set)
+                # Kill the selected OSDs first, then mark them out. This makes
+                # the error injection a single 'atomic' failure: it simulates
+                # multiple OSDs failing within a couple of minutes, with the mon
+                # marking them out mon_osd_down_out_interval (10 mins) later.
+                # In contrast, killing and marking out each OSD in turn
+                # simulates a rolling failure, where rebalancing and async
+                # recovery can start after the first OSD is marked out,
+                # further reducing redundancy. With this many injections in
+                # quick succession, that risks a PG in the pool becoming dead.
                  for i in range(0, most_killable):
-                    self.kill_osd(osd=acting_set[i], mark_out=True)
+                    self.kill_osd(osd=acting_set[i])
+                for i in range(0, most_killable):
+                    self.out_osd(osd=acting_set[i])
                  self.log("dead_osds={d}, live_osds={ld}".format(d=self.dead_osds, ld=self.live_osds))
                  with safe_while(
                      sleep=25, tries=5,
author	Bill Scales <bill_scales@uk.ibm.com>
	Sun, 17 Aug 2025 15:42:11 +0000 (16:42 +0100)
committer	Bill Scales <bill_scales@uk.ibm.com>
	Sun, 17 Aug 2025 15:54:26 +0000 (16:54 +0100)