From: Rishabh Dave Date: Thu, 11 Jul 2024 11:07:16 +0000 (+0530) Subject: Merge pull request #57492 from rishabh-d-dave/qa-fs-mds-fail-improve X-Git-Tag: v20.0.0~1549 X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=384acdeb479b8cedcc70ab2db8e6cf783e7741bc;p=ceph.git Merge pull request #57492 from rishabh-d-dave/qa-fs-mds-fail-improve qa/cephfs: improvements for "mds fail" and "fs fail" Reviewed-by: Patrick Donnelly --- 384acdeb479b8cedcc70ab2db8e6cf783e7741bc diff --cc qa/tasks/cephfs/test_admin.py index b4bc504547a7,4d9827411356..a321c0e3457a --- a/qa/tasks/cephfs/test_admin.py +++ b/qa/tasks/cephfs/test_admin.py @@@ -165,140 -91,29 +165,131 @@@ class TestAdminCommands(CephFSTestCase) if overwrites: self.run_ceph_cmd('osd', 'pool', 'set', n+"-data", 'allow_ec_overwrites', 'true') - def _get_unhealthy_mds_id(self, health_report, health_warn): - ''' - Return MDS ID for which health warning in "health_warn" has been - generated. - ''' - # variable "msg" should hold string something like this - - # 'mds.b(mds.0): Behind on trimming (865/10) max_segments: 10, - # num_segments: 86 - msg = health_report['checks'][health_warn]['detail'][0]['message'] - mds_id = msg.split('(')[0] - mds_id = mds_id.replace('mds.', '') - return mds_id + def gen_health_warn_mds_cache_oversized(self): + health_warn = 'MDS_CACHE_OVERSIZED' - def wait_till_health_warn(self, health_warn, active_mds_id, sleep=3, - tries=10): - errmsg = (f'Expected health warning "{health_warn}" to eventually ' - 'show up in output of command "ceph health detail". Tried ' - f'{tries} times with interval of {sleep} seconds but the ' - 'health warning didn\'t turn up.') + self.config_set('mds', 'mds_cache_memory_limit', '1K') + self.config_set('mds', 'mds_health_cache_threshold', '1.00000') + self.mount_a.open_n_background('.', 400) - with safe_while(sleep=sleep, tries=tries, action=errmsg) as proceed: - while proceed(): - self.get_ceph_cmd_stdout( - f'tell mds.{active_mds_id} cache status') + self.wait_for_health(health_warn, 30) - health_report = json.loads(self.get_ceph_cmd_stdout( - 'health detail --format json')) + def gen_health_warn_mds_trim(self): + health_warn = 'MDS_TRIM' + + # for generating health warning MDS_TRIM + self.config_set('mds', 'mds_debug_subtrees', 'true') + # this will really really slow the trimming, so that MDS_TRIM stays + # for longer. + self.config_set('mds', 'mds_log_trim_decay_rate', '60') + self.config_set('mds', 'mds_log_trim_threshold', '1') + self.mount_a.open_n_background('.', 400) - if health_warn in health_report['checks']: - return + self.wait_for_health(health_warn, 30) +class TestMdsLastSeen(CephFSTestCase): + """ + Tests for `mds last-seen` command. + """ + + MDSS_REQUIRED = 2 + + def test_in_text(self): + """ + That `mds last-seen` returns 0 for an MDS currently in the map. + """ + + status = self.fs.status() + r0 = self.fs.get_rank(0, status=status) + s = self.get_ceph_cmd_stdout("mds", "last-seen", r0['name']) + seconds = int(re.match(r"^(\d+)s$", s).group(1)) + self.assertEqual(seconds, 0) + + def test_in_json(self): + """ + That `mds last-seen` returns 0 for an MDS currently in the map. + """ + + status = self.fs.status() + r0 = self.fs.get_rank(0, status=status) + s = self.get_ceph_cmd_stdout("--format=json", "mds", "last-seen", r0['name']) + J = json.loads(s) + seconds = int(re.match(r"^(\d+)s$", J['last-seen']).group(1)) + self.assertEqual(seconds, 0) + + def test_unknown(self): + """ + That `mds last-seen` returns ENOENT for an mds not in recent maps. + """ + + try: + self.get_ceph_cmd_stdout("--format=json", "mds", "last-seen", 'foo') + except CommandFailedError as e: + self.assertEqual(e.exitstatus, errno.ENOENT) + else: + self.fail("non-existent mds should fail ENOENT") + + def test_standby(self): + """ + That `mds last-seen` returns 0 for a standby. + """ + + status = self.fs.status() + for info in status.get_standbys(): + s = self.get_ceph_cmd_stdout("--format=json", "mds", "last-seen", info['name']) + J = json.loads(s) + seconds = int(re.match(r"^(\d+)s$", J['last-seen']).group(1)) + self.assertEqual(seconds, 0) + + def test_stopped(self): + """ + That `mds last-seen` returns >0 for mds that is stopped. + """ + + status = self.fs.status() + r0 = self.fs.get_rank(0, status=status) + self.fs.mds_stop(mds_id=r0['name']) + self.fs.rank_fail() + sleep(2) + with safe_while(sleep=1, tries=self.fs.beacon_timeout, action='wait for last-seen >0') as proceed: + while proceed(): + s = self.get_ceph_cmd_stdout("--format=json", "mds", "last-seen", r0['name']) + J = json.loads(s) + seconds = int(re.match(r"^(\d+)s$", J['last-seen']).group(1)) + if seconds == 0: + continue + self.assertGreater(seconds, 1) + break + + def test_gc(self): + """ + That historical mds information is eventually garbage collected. + """ + + prune_time = 20 + sleep_time = 2 + self.config_set('mon', 'mon_fsmap_prune_threshold', prune_time) + status = self.fs.status() + r0 = self.fs.get_rank(0, status=status) + self.fs.mds_stop(mds_id=r0['name']) + self.fs.rank_fail() + last = 0 + for i in range(prune_time): + sleep(sleep_time) # we will sleep twice prune_time + try: + s = self.get_ceph_cmd_stdout("--format=json", "mds", "last-seen", r0['name']) + J = json.loads(s) + seconds = int(re.match(r"^(\d+)s$", J['last-seen']).group(1)) + self.assertGreater(seconds, last) + log.debug("last_seen: %ds", seconds) + last = seconds + except CommandFailedError as e: + self.assertEqual(e.exitstatus, errno.ENOENT) + self.assertGreaterEqual(last + sleep_time + 1, prune_time) # rounding error add 1 + return + self.fail("map was no garbage collected as expected") + @classhook('_add_valid_tell') class TestValidTell(TestAdminCommands): @classmethod