A journal flush sometimes takes more than 120 seconds and so the 'scrub
status' command after blocking for more than 120 seconds is declared
failed causing the job to be declared as failed.
This bumping up of the timeout gives more time for the 'scrub status'
command to wait and eventually let the journal flush to complete.
Fixes: https://tracker.ceph.com/issues/63411
Signed-off-by: Milind Changire <mchangir@redhat.com>
(cherry picked from commit
33899fdaac57db5266940f59f5ef956932aa1714)
Conflicts:
qa/tasks/ceph_manager.py
- fixed diff between main and reef
qa/tasks/cephfs/filesystem.py
- fixed diff between main and reef
qa/tasks/vstart_runner.py
- fixed diff between main and reef
self.cephadm = cephadm
self.testdir = teuthology.get_testdir(self.ctx)
# prefix args for ceph cmds to be executed
- pre = ['adjust-ulimits', 'ceph-coverage',
- f'{self.testdir}/archive/coverage']
- self.CEPH_CMD = ['sudo'] + pre + ['timeout', '120', 'ceph',
- '--cluster', self.cluster]
- self.RADOS_CMD = pre + ['rados', '--cluster', self.cluster]
+ self.pre = ['adjust-ulimits', 'ceph-coverage',
+ f'{self.testdir}/archive/coverage']
+ self.RADOS_CMD = self.pre + ['rados', '--cluster', self.cluster]
self.run_ceph_w_prefix = ['sudo', 'daemon-helper', 'kill', 'ceph',
'--cluster', self.cluster]
except CommandFailedError:
self.log('Failed to get pg_num from pool %s, ignoring' % pool)
+ def get_ceph_cmd(self, **kwargs):
+ timeout = kwargs.pop('timeout', 120)
+ return ['sudo'] + self.pre + ['timeout', f'{timeout}', 'ceph',
+ '--cluster', self.cluster]
+
def ceph(self, cmd, **kwargs):
"""
Simple Ceph admin command wrapper around run_cluster_cmd.
stdout=StringIO(),
check_status=kwargs.get('check_status', True))
else:
- kwargs['args'] = prefixcmd + self.CEPH_CMD + kwargs['args']
+ kwargs['args'] = prefixcmd + self.get_ceph_cmd(**kwargs) + kwargs['args']
return self.controller.run(**kwargs)
def raw_cluster_cmd(self, *args, **kwargs) -> str:
info = self.get_rank(rank=rank, status=status)
return self.json_asok(command, 'mds', info['name'], timeout=timeout)
- def rank_tell(self, command, rank=0, status=None):
+ def rank_tell(self, command, rank=0, status=None, timeout=120):
try:
- out = self.mon_manager.raw_cluster_cmd("tell", f"mds.{self.id}:{rank}", *command)
+ out = self.mon_manager.raw_cluster_cmd("tell", f"mds.{self.id}:{rank}", *command, timeout=timeout)
return json.loads(out)
except json.decoder.JSONDecodeError:
log.error("could not decode: {}".format(out))
self.set_max_mds(new_max_mds)
return self.wait_for_daemons()
- def run_scrub(self, cmd, rank=0):
- return self.rank_tell(["scrub"] + cmd, rank)
+ def run_scrub(self, cmd, rank=0, timeout=300):
+ return self.rank_tell(["scrub"] + cmd, rank=rank, timeout=timeout)
def get_scrub_status(self, rank=0):
- return self.run_scrub(["status"], rank)
+ return self.run_scrub(["status"], rank=rank, timeout=300)
def flush(self, rank=0):
return self.rank_tell(["flush", "journal"], rank=rank)
result = "no active scrubs running"
with contextutil.safe_while(sleep=sleep, tries=timeout//sleep) as proceed:
while proceed():
- out_json = self.rank_tell(["scrub", "status"], rank=rank)
+ out_json = self.rank_tell(["scrub", "status"], rank=rank, timeout=timeout)
assert out_json is not None
if not reverse:
if result in out_json['status']:
self.rook = False
self.testdir = None
self.run_ceph_w_prefix = self.run_cluster_cmd_prefix = [CEPH_CMD]
- self.CEPH_CMD = [CEPH_CMD]
self.RADOS_CMD = [RADOS_CMD]
+ def get_ceph_cmd(self, **kwargs):
+ return [CEPH_CMD]
+
def find_remote(self, daemon_type, daemon_id):
"""
daemon_type like 'mds', 'osd'