From: Nitzan Mordechai Date: Thu, 18 May 2023 13:37:38 +0000 (+0000) Subject: test: monitor thrasher wait until quorum X-Git-Tag: v19.0.0~1123^2 X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=fbd10badbfad71f208de6b48008a20963d375ae9;p=ceph.git test: monitor thrasher wait until quorum With a 1-second delay we may sometimes fail to get the correct quorum size, since the monitor had not been updated in time. With the following fix, we wait for the quorum, checking every 3 seconds until a 30-second timeout. Fixes: https://tracker.ceph.com/issues/52316 Signed-off-by: Nitzan Mordechai --- diff --git a/qa/tasks/mon_thrash.py b/qa/tasks/mon_thrash.py index 4224acf0319..30a7555b55a 100644 --- a/qa/tasks/mon_thrash.py +++ b/qa/tasks/mon_thrash.py @@ -9,6 +9,7 @@ import gevent import json import math from teuthology import misc as teuthology +from teuthology.contextutil import safe_while from tasks import ceph_manager from tasks.cephfs.filesystem import MDSCluster from tasks.thrasher import Thrasher @@ -224,6 +225,25 @@ class MonitorThrasher(Thrasher): else: return m + def _wait_until_quorum(self, mon, size, timeout=300): + """ + Wait until the monitor specified is in the quorum. + """ + self.log('waiting for quorum size %d for mon %s' % (size, mon)) + s = {} + + with safe_while(sleep=3, + tries=timeout // 3, + action=f'wait for quorum size {size} on mon {mon}') as proceed: + while proceed(): + s = self.manager.get_mon_status(mon) + if len(s['quorum']) == size: + break + self.log("quorum is size %d" % len(s['quorum'])) + + self.log("final quorum is size %d" % len(s['quorum'])) + return s + def do_thrash(self): """ _do_thrash() wrapper. 
@@ -261,7 +281,11 @@ class MonitorThrasher(Thrasher): self.manager.wait_for_mon_quorum_size(len(mons)) self.log('making sure all monitors are in the quorum') for m in mons: - s = self.manager.get_mon_status(m) + try: + s = self._wait_until_quorum(m, len(mons), timeout=30) + except Exception as e: + self.log('mon.{m} is not in quorum size, exception: {e}'.format(m=m,e=e)) + self.log('mon_status: {s}'.format(s=s)) assert s['state'] == 'leader' or s['state'] == 'peon' assert len(s['quorum']) == len(mons) @@ -300,7 +324,12 @@ class MonitorThrasher(Thrasher): for m in mons: if m in mons_to_kill: continue - s = self.manager.get_mon_status(m) + try: + s = self._wait_until_quorum(m, len(mons)-len(mons_to_kill), timeout=30) + except Exception as e: + self.log('mon.{m} is not in quorum size, exception: {e}'.format(m=m,e=e)) + self.log('mon_status: {s}'.format(s=s)) + assert s['state'] == 'leader' or s['state'] == 'peon' assert len(s['quorum']) == len(mons)-len(mons_to_kill) @@ -322,7 +351,12 @@ class MonitorThrasher(Thrasher): self.manager.wait_for_mon_quorum_size(len(mons)) for m in mons: - s = self.manager.get_mon_status(m) + try: + s = self._wait_until_quorum(m, len(mons), timeout=30) + except Exception as e: + self.log('mon.{m} is not in quorum size, exception: {e}'.format(m=m,e=e)) + self.log('mon_status: {s}'.format(s=s)) + assert s['state'] == 'leader' or s['state'] == 'peon' assert len(s['quorum']) == len(mons)