From 9600152593fe4d8f40c7a6b071aa9c905c8cc6cb Mon Sep 17 00:00:00 2001
From: Sage Weil
Date: Mon, 20 Jan 2020 09:45:00 -0600
Subject: [PATCH] qa/tasks/ceph_manager: fix post-osd-kill pg peered check

This was asserting that all PGs are active or peered, but that
assertion could fail if the concurrent workload created a new pool.
Switch to a loop that checks several times for the condition to be
true.

Fixes: https://tracker.ceph.com/issues/43656
Signed-off-by: Sage Weil
---
 qa/tasks/ceph_manager.py | 13 ++++++++++---
 1 file changed, 10 insertions(+), 3 deletions(-)

diff --git a/qa/tasks/ceph_manager.py b/qa/tasks/ceph_manager.py
index f83b5d58a796b..d7257c80b4f81 100644
--- a/qa/tasks/ceph_manager.py
+++ b/qa/tasks/ceph_manager.py
@@ -842,9 +842,16 @@ class OSDThrasher(Thrasher):
             self.log("chose to kill {n} OSDs".format(n=most_killable))
             for i in range(1, most_killable):
                 self.kill_osd(mark_out=True)
-            time.sleep(15)
-            assert self.ceph_manager.all_active_or_peered(), \
-                'not all PGs are active or peered 15 seconds after marking out OSDs'
+            time.sleep(10)
+            # try a few times since there might be a concurrent pool
+            # creation or deletion
+            with safe_while(
+                    sleep=5, tries=5,
+                    action='check for active or peered') as proceed:
+                while proceed():
+                    if self.ceph_manager.all_active_or_peered():
+                        break
+                    self.log('not all PGs are active or peered')
         else: # chose to revive OSDs, bring up a random fraction of the dead ones
             self.log("chose to revive osds")
             for i in range(1, int(rand_val * len(self.dead_osds))):
-- 
2.39.5
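
For reference, the retry loop above relies on teuthology's safe_while context
manager. Below is a minimal standalone sketch of the same polling pattern,
assuming teuthology is importable; check() and log() are hypothetical
stand-ins for self.ceph_manager.all_active_or_peered() and self.log() in the
patched code.

    # Sketch only: check() is any boolean cluster-health probe, e.g. a
    # wrapper around ceph_manager.all_active_or_peered().
    from teuthology.contextutil import safe_while

    def wait_until_true(check, log=print):
        # Poll up to 5 times, sleeping 5 seconds between attempts;
        # safe_while raises MaxWhileTries if check() never returns True.
        with safe_while(sleep=5, tries=5,
                        action='check for active or peered') as proceed:
            while proceed():
                if check():
                    break
                log('not all PGs are active or peered')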