]> git.apps.os.sepia.ceph.com Git - ceph.git/commitdiff
qa/cephfs: test that counters are not printed for SR MDS 57795/head
authorRishabh Dave <ridave@redhat.com>
Tue, 26 Mar 2024 06:04:23 +0000 (11:34 +0530)
committerRishabh Dave <ridave@redhat.com>
Thu, 30 May 2024 18:20:57 +0000 (23:50 +0530)
- Add tests to verify that inode and stray counters are not
  replayed/included in the health warnings printed for the
  standby-replay MDS.

- Add the "MDS_CACHE_OVERSIZED" health warning to the ignorelist in
  failover.yaml.

- Add a helper method to qa.tasks.cephfs.filesystem.Filesystem to get
  the names of the standby-replay MDS daemons.

Signed-off-by: Rishabh Dave <ridave@redhat.com>
(cherry picked from commit 2784e224e7af38d5b96c573df7dfb373de53937b)

qa/suites/fs/multifs/tasks/failover.yaml
qa/tasks/cephfs/filesystem.py
qa/tasks/cephfs/test_failover.py

index 9c403c76db6c8b48f49e8977bbd7f52285b72c10..55dde639c237c074215c706e8cebec6f6520c05e 100644 (file)
@@ -7,6 +7,7 @@ overrides:
       - \(MDS_UP_LESS_THAN_MAX\)
       - \(MDS_DAMAGE\)
       - \(FS_DEGRADED\)
+      - \(MDS_CACHE_OVERSIZED\)
   ceph-fuse:
     disabled: true
 tasks:
index cef55277b5055bccbfe3754249d1bf15aa120adb..dbb3d2f7d9743a8aa3fa12914a12f84c26fa1925 100644 (file)
@@ -1113,6 +1113,9 @@ class FilesystemBase(MDSClusterBase):
         """
         return self.get_daemon_names("up:active", status=status)
 
+    def get_standby_replay_names(self, status=None):
+        """
+        Return the result of get_daemon_names() for daemons that are in
+        the "up:standby-replay" state.
+
+        :param status: forwarded unchanged to get_daemon_names();
+                       defaults to None.
+        """
+        return self.get_daemon_names('up:standby-replay', status=status)
+
     def get_all_mds_rank(self, status=None):
         mdsmap = self.get_mds_map(status)
         result = []
index 0a3e0a59c057bdb3d783df10451c78bb9667bfdc..2df7fefe18939000382aca12347daeef68b98425 100644 (file)
@@ -3,9 +3,11 @@ import signal
 import logging
 import operator
 from random import randint, choice
+from json import loads as json_loads
 
 from tasks.cephfs.cephfs_test_case import CephFSTestCase
 from teuthology.exceptions import CommandFailedError
+from teuthology.contextutil import safe_while
 from tasks.cephfs.fuse_mount import FuseMount
 
 log = logging.getLogger(__name__)
@@ -520,7 +522,8 @@ class TestFailover(CephFSTestCase):
 
 
 class TestStandbyReplay(CephFSTestCase):
-    CLIENTS_REQUIRED = 0
+
+    CLIENTS_REQUIRED = 1
     MDSS_REQUIRED = 4
 
     def _confirm_no_replay(self):
@@ -706,6 +709,72 @@ class TestStandbyReplay(CephFSTestCase):
         status = self._confirm_single_replay()
         self.assertTrue(standby_count, len(list(status.get_standbys())))
 
+    def test_health_warn_oversize_cache_has_no_counters(self):
+        '''
+        Test that when MDS cache size crosses the limit, health warning
+        printed for standy-replay MDS doesn't include inode and stray
+        counters.
+
+        Tests: https://tracker.ceph.com/issues/63514
+        '''
+        # reduce MDS cache limit, default MDS cache limit is too high which
+        # will unnecessarily consume too many resources and too much time.
+        self.config_set('mds', 'mds_cache_memory_limit', '1K')
+        # health warning for crossing MDS cache size limit won't be raised
+        # until a threshold. default threshold is too high. it will
+        # unnecessarily consume so much time and resources.
+        self.config_set('mds', 'mds_health_cache_threshold', '1.000001')
+        # so that there is only active MDS and only 1 health warning is
+        # produced. presence of 2 warning should cause this test to fail
+        self.fs.set_max_mds(1)
+        self.fs.set_allow_standby_replay(True)
+        self._confirm_single_replay()
+        self.fs.wait_for_daemons()
+        # The call above (to self.fs.wait_for_daemons()) should ensure we have
+        # only 1 active MDS on cluster
+        active_mds_id = self.fs.get_active_names()[0]
+        sr_mds_id = self.fs.get_standby_replay_names()[0]
+
+        # this should generate more than enough MDS cache to trigger health
+        # warning MDS_CACHE_OVERSIZED.
+        self.mount_a.open_n_background(".", 400)
+
+        # actual test begins now...
+        with safe_while(sleep=3, tries=10) as proceed:
+            while proceed():
+                # logging cache generated so far for th sake of easy
+                # debugging in future.
+                self.get_ceph_cmd_stdout(f'tell mds.{active_mds_id} cache '
+                                          'status')
+
+                health_report = self.get_ceph_cmd_stdout('health detail '
+                                                         '--format json')
+                health_report = json_loads(health_report)
+                if 'MDS_CACHE_OVERSIZED' not in health_report['checks']:
+                    log.debug('warning hasn\'t appeared in health report yet.'
+                             'trying again after some sleep...')
+                    continue
+
+                cache_warn = health_report['checks']['MDS_CACHE_OVERSIZED']\
+                        ['detail']
+                log.debug(f'cache_warn - {cache_warn}')
+                # sanity check: "ceph health detail" output should've 2
+                # warnings -- one for active MDS and other for standby-replay
+                # MDS.
+                if len(cache_warn) != 2:
+                    log.debug('expected 2 warnings but instead found '
+                              f'{len(cache_warn)} warnings; trying again '
+                               'after some sleep...')
+                    continue
+
+                for cw in cache_warn:
+                    msg = cw['message']
+                    if f'mds.{sr_mds_id}' not in cw['message']:
+                        continue
+                    self.assertNotIn('inodes in use by clients', msg)
+                    self.assertNotIn('stray files', msg)
+                    return
+
 
 class TestMultiFilesystems(CephFSTestCase):
     CLIENTS_REQUIRED = 2