From: Rishabh Dave Date: Tue, 26 Mar 2024 06:04:23 +0000 (+0530) Subject: qa/cephfs: test that counters are not printed for SR MDS X-Git-Tag: v19.1.1~260^2 X-Git-Url: http://git.apps.os.sepia.ceph.com/?a=commitdiff_plain;h=b2bfb5876fd55196dcee038bff67cc07ffacc89c;p=ceph.git qa/cephfs: test that counters are not printed for SR MDS - Add tests to verify that inode and stray counters are not replayed/included in the health warnings printed for the standby-replay MDS. - Add "MDS_CACHE_OVERSIZED" health warning to ignorelist to failover.yaml. - Add a helper method to qa.tasks.cephfs.filesystem.Filesystem to get MDS name of standby-replay MDS. Signed-off-by: Rishabh Dave (cherry picked from commit 2784e224e7af38d5b96c573df7dfb373de53937b) --- diff --git a/qa/suites/fs/multifs/tasks/failover.yaml b/qa/suites/fs/multifs/tasks/failover.yaml index 9c403c76db6c8..55dde639c237c 100644 --- a/qa/suites/fs/multifs/tasks/failover.yaml +++ b/qa/suites/fs/multifs/tasks/failover.yaml @@ -7,6 +7,7 @@ overrides: - \(MDS_UP_LESS_THAN_MAX\) - \(MDS_DAMAGE\) - \(FS_DEGRADED\) + - \(MDS_CACHE_OVERSIZED\) ceph-fuse: disabled: true tasks: diff --git a/qa/tasks/cephfs/filesystem.py b/qa/tasks/cephfs/filesystem.py index cef55277b5055..dbb3d2f7d9743 100644 --- a/qa/tasks/cephfs/filesystem.py +++ b/qa/tasks/cephfs/filesystem.py @@ -1113,6 +1113,9 @@ class FilesystemBase(MDSClusterBase): """ return self.get_daemon_names("up:active", status=status) + def get_standby_replay_names(self, status=None): + return self.get_daemon_names('up:standby-replay', status=status) + def get_all_mds_rank(self, status=None): mdsmap = self.get_mds_map(status) result = [] diff --git a/qa/tasks/cephfs/test_failover.py b/qa/tasks/cephfs/test_failover.py index 0a3e0a59c057b..2df7fefe18939 100644 --- a/qa/tasks/cephfs/test_failover.py +++ b/qa/tasks/cephfs/test_failover.py @@ -3,9 +3,11 @@ import signal import logging import operator from random import randint, choice +from json import loads as json_loads from 
tasks.cephfs.cephfs_test_case import CephFSTestCase from teuthology.exceptions import CommandFailedError +from teuthology.contextutil import safe_while from tasks.cephfs.fuse_mount import FuseMount log = logging.getLogger(__name__) @@ -520,7 +522,8 @@ class TestFailover(CephFSTestCase): class TestStandbyReplay(CephFSTestCase): - CLIENTS_REQUIRED = 0 + + CLIENTS_REQUIRED = 1 MDSS_REQUIRED = 4 def _confirm_no_replay(self): @@ -706,6 +709,72 @@ class TestStandbyReplay(CephFSTestCase): status = self._confirm_single_replay() self.assertTrue(standby_count, len(list(status.get_standbys()))) + def test_health_warn_oversize_cache_has_no_counters(self): + ''' + Test that when MDS cache size crosses the limit, health warning + printed for standby-replay MDS doesn't include inode and stray + counters. + + Tests: https://tracker.ceph.com/issues/63514 + ''' + # reduce MDS cache limit, default MDS cache limit is too high which + # will unnecessarily consume too many resources and too much time. + self.config_set('mds', 'mds_cache_memory_limit', '1K') + # health warning for crossing MDS cache size limit won't be raised + # until a threshold. default threshold is too high. it will + # unnecessarily consume so much time and resources. + self.config_set('mds', 'mds_health_cache_threshold', '1.000001') + # so that there is only 1 active MDS and only 1 health warning is + # produced. presence of 2 warnings should cause this test to fail + self.fs.set_max_mds(1) + self.fs.set_allow_standby_replay(True) + self._confirm_single_replay() + self.fs.wait_for_daemons() + # The call above (to self.fs.wait_for_daemons()) should ensure we have + # only 1 active MDS on cluster + active_mds_id = self.fs.get_active_names()[0] + sr_mds_id = self.fs.get_standby_replay_names()[0] + + # this should generate more than enough MDS cache to trigger health + # warning MDS_CACHE_OVERSIZED. + self.mount_a.open_n_background(".", 400) + + # actual test begins now... 
+ with safe_while(sleep=3, tries=10) as proceed: + while proceed(): + # logging cache generated so far for the sake of easy + # debugging in future. + self.get_ceph_cmd_stdout(f'tell mds.{active_mds_id} cache ' + 'status') + + health_report = self.get_ceph_cmd_stdout('health detail ' + '--format json') + health_report = json_loads(health_report) + if 'MDS_CACHE_OVERSIZED' not in health_report['checks']: + log.debug('warning hasn\'t appeared in health report yet. ' + 'trying again after some sleep...') + continue + + cache_warn = health_report['checks']['MDS_CACHE_OVERSIZED']\ ['detail'] + log.debug(f'cache_warn - {cache_warn}') + # sanity check: "ceph health detail" output should have 2 + # warnings -- one for active MDS and other for standby-replay + # MDS. + if len(cache_warn) != 2: + log.debug('expected 2 warnings but instead found ' + f'{len(cache_warn)} warnings; trying again ' + 'after some sleep...') + continue + + for cw in cache_warn: + msg = cw['message'] + if f'mds.{sr_mds_id}' not in cw['message']: + continue + self.assertNotIn('inodes in use by clients', msg) + self.assertNotIn('stray files', msg) + return + class TestMultiFilesystems(CephFSTestCase): CLIENTS_REQUIRED = 2