From: Rishabh Dave
Date: Mon, 25 Mar 2024 12:05:38 +0000 (+0530)
Subject: qa/cephfs: add tests failing MDS and FS when MDS is unhealthy
X-Git-Tag: v19.1.1~238^2~2
X-Git-Url: http://git.apps.os.sepia.ceph.com/?a=commitdiff_plain;h=4de11a55a464d79219649c1eb103f8edbede92bb;p=ceph.git

qa/cephfs: add tests failing MDS and FS when MDS is unhealthy

Add tests to verify that the confirmation flag is mandatory for running
the commands "ceph mds fail" and "ceph fs fail" when the MDS has one of
these two health warnings: MDS_CACHE_OVERSIZED or MDS_TRIM.

Also, add MDS_CACHE_OVERSIZED and MDS_TRIM to the ignorelist for
test_admin.py so that QA jobs know these are expected failures.

Signed-off-by: Rishabh Dave
(cherry picked from commit 214d614309a75fe9926d1a46be5b184c7cd0cbc1)
---

diff --git a/qa/suites/fs/functional/tasks/admin.yaml b/qa/suites/fs/functional/tasks/admin.yaml
index c51cc68fed46e..f824c30cd0ee7 100644
--- a/qa/suites/fs/functional/tasks/admin.yaml
+++ b/qa/suites/fs/functional/tasks/admin.yaml
@@ -5,6 +5,8 @@ overrides:
         lockdep: true
     log-ignorelist:
       - missing required features
+      - \(MDS_CACHE_OVERSIZED\)
+      - \(MDS_TRIM\)
 tasks:
   - cephfs_test_runner:
       fail_on_skip: false
diff --git a/qa/tasks/cephfs/test_admin.py b/qa/tasks/cephfs/test_admin.py
index 8c19ec5c49dee..22dc89cdfdfdd 100644
--- a/qa/tasks/cephfs/test_admin.py
+++ b/qa/tasks/cephfs/test_admin.py
@@ -91,6 +91,38 @@ class TestAdminCommands(CephFSTestCase):
         if overwrites:
             self.run_ceph_cmd('osd', 'pool', 'set', n+"-data", 'allow_ec_overwrites', 'true')
 
+    def _get_unhealthy_mds_id(self, health_report, health_warn):
+        '''
+        Return the ID of the MDS for which the health warning in
+        "health_warn" has been generated.
+        '''
+        # variable "msg" should hold a string like this -
+        # 'mds.b(mds.0): Behind on trimming (865/10) max_segments: 10,
+        # num_segments: 865'
+        msg = health_report['checks'][health_warn]['detail'][0]['message']
+        mds_id = msg.split('(')[0]
+        mds_id = mds_id.replace('mds.', '')
+        return mds_id
+
+    def wait_till_health_warn(self, health_warn, active_mds_id, sleep=3,
+                              tries=10):
+        errmsg = (f'Expected health warning "{health_warn}" to eventually '
+                  'show up in the output of command "ceph health detail". '
+                  f'Tried {tries} times with an interval of {sleep} seconds '
+                  'but the health warning didn\'t turn up.')
+
+        with safe_while(sleep=sleep, tries=tries, action=errmsg) as proceed:
+            while proceed():
+                self.get_ceph_cmd_stdout(
+                    f'tell mds.{active_mds_id} cache status')
+
+                health_report = json.loads(self.get_ceph_cmd_stdout(
+                    'health detail --format json'))
+
+                if health_warn in health_report['checks']:
+                    return
+
+
 @classhook('_add_valid_tell')
 class TestValidTell(TestAdminCommands):
     @classmethod
@@ -2106,3 +2138,166 @@ class TestPermErrMsg(CephFSTestCase):
             args=(f'fs authorize {self.fs.name} {self.CLIENT_NAME} / '
                   f'{wrong_perm}'), retval=self.EXPECTED_ERRNO,
             errmsgs=self.EXPECTED_ERRMSG)
+
+
+class TestFSFail(TestAdminCommands):
+
+    MDSS_REQUIRED = 2
+    CLIENTS_REQUIRED = 1
+
+    def test_with_health_warn_oversize_cache(self):
+        '''
+        Test that, when the health warning MDS_CACHE_OVERSIZED is present
+        for an MDS, the command "ceph fs fail" fails without the
+        confirmation flag and passes when the confirmation flag is passed.
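+
+        (The warning is induced below by shrinking mds_cache_memory_limit
+        to 1K and opening many files in the background, pushing the cache
+        well past mds_health_cache_threshold.)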
+        '''
+        health_warn = 'MDS_CACHE_OVERSIZED'
+        self.config_set('mds', 'mds_cache_memory_limit', '1K')
+        self.config_set('mds', 'mds_health_cache_threshold', '1.00000')
+        active_mds_id = self.fs.get_active_names()[0]
+
+        self.mount_a.open_n_background('.', 400)
+        self.wait_till_health_warn(health_warn, active_mds_id)
+
+        # actual testing begins now.
+        errmsg = 'mds_cache_oversized'
+        self.negtest_ceph_cmd(args=f'fs fail {self.fs.name}',
+                              retval=1, errmsgs=errmsg)
+        self.run_ceph_cmd(f'fs fail {self.fs.name} --yes-i-really-mean-it')
+
+    def test_with_health_warn_trim(self):
+        '''
+        Test that, when the health warning MDS_TRIM is present for an MDS,
+        the command "ceph fs fail" fails without the confirmation flag and
+        passes when the confirmation flag is passed.
+        '''
+        health_warn = 'MDS_TRIM'
+        # for generating health warning MDS_TRIM
+        self.config_set('mds', 'mds_debug_subtrees', 'true')
+        # this drastically slows down trimming, so that MDS_TRIM sticks
+        # around for longer.
+        self.config_set('mds', 'mds_log_trim_decay_rate', '60')
+        self.config_set('mds', 'mds_log_trim_threshold', '1')
+        active_mds_id = self.fs.get_active_names()[0]
+
+        self.mount_a.open_n_background('.', 400)
+        self.wait_till_health_warn(health_warn, active_mds_id)
+
+        # actual testing begins now.
+        errmsg = 'mds_trim'
+        self.negtest_ceph_cmd(args=f'fs fail {self.fs.name}',
+                              retval=1, errmsgs=errmsg)
+        self.run_ceph_cmd(f'fs fail {self.fs.name} --yes-i-really-mean-it')
+
+    def test_with_health_warn_with_2_active_MDSs(self):
+        '''
+        Test that, when a CephFS has 2 active MDSs and one of them has
+        either the health warning MDS_TRIM or MDS_CACHE_OVERSIZED, running
+        "ceph fs fail" fails without the confirmation flag and passes when
+        the confirmation flag is passed.
+        '''
+        health_warn = 'MDS_CACHE_OVERSIZED'
+        self.fs.set_max_mds(2)
+        self.config_set('mds', 'mds_cache_memory_limit', '1K')
+        self.config_set('mds', 'mds_health_cache_threshold', '1.00000')
+        self.fs.wait_for_daemons()
+        mds1_id, mds2_id = self.fs.get_active_names()
+
+        self.mount_a.open_n_background('.', 400)
+        # wait till the health warning is generated for one of the MDSs.
+        self.wait_till_health_warn(health_warn, mds1_id)
+
+        # actual testing begins now.
+        errmsg = 'mds_cache_oversized'
+        self.negtest_ceph_cmd(args=f'fs fail {self.fs.name}',
+                              retval=1, errmsgs=errmsg)
+        self.run_ceph_cmd(f'fs fail {self.fs.name} --yes-i-really-mean-it')
+
+
+class TestMDSFail(TestAdminCommands):
+
+    MDSS_REQUIRED = 2
+    CLIENTS_REQUIRED = 1
+
+    def test_with_health_warn_oversize_cache(self):
+        '''
+        Test that, when the health warning MDS_CACHE_OVERSIZED is present
+        for an MDS, the command "ceph mds fail" fails without the
+        confirmation flag and passes when the confirmation flag is passed.
+        '''
+        health_warn = 'MDS_CACHE_OVERSIZED'
+        self.config_set('mds', 'mds_cache_memory_limit', '1K')
+        self.config_set('mds', 'mds_health_cache_threshold', '1.00000')
+        active_mds_id = self.fs.get_active_names()[0]
+
+        self.mount_a.open_n_background('.', 400)
+        self.wait_till_health_warn(health_warn, active_mds_id)
+
+        # actual testing begins now.
+        errmsg = 'mds_cache_oversized'
+        self.negtest_ceph_cmd(args=f'mds fail {active_mds_id}',
+                              retval=1, errmsgs=errmsg)
+        self.run_ceph_cmd(f'mds fail {active_mds_id} --yes-i-really-mean-it')
+
+    def test_with_health_warn_trim(self):
+        '''
+        Test that, when the health warning MDS_TRIM is present for an MDS,
+        the command "ceph mds fail" fails without the confirmation flag
+        and passes when the confirmation flag is passed.
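+
+        (The warning is induced below by enabling mds_debug_subtrees and
+        drastically slowing log trimming via mds_log_trim_decay_rate and
+        mds_log_trim_threshold.)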
+        '''
+        health_warn = 'MDS_TRIM'
+        # for generating health warning MDS_TRIM
+        self.config_set('mds', 'mds_debug_subtrees', 'true')
+        # this drastically slows down trimming, so that MDS_TRIM sticks
+        # around for longer.
+        self.config_set('mds', 'mds_log_trim_decay_rate', '60')
+        self.config_set('mds', 'mds_log_trim_threshold', '1')
+        active_mds_id = self.fs.get_active_names()[0]
+
+        self.mount_a.open_n_background('.', 400)
+        self.wait_till_health_warn(health_warn, active_mds_id)
+
+        # actual testing begins now...
+        errmsg = 'mds_trim'
+        self.negtest_ceph_cmd(args=f'mds fail {active_mds_id}',
+                              retval=1, errmsgs=errmsg)
+        self.run_ceph_cmd(f'mds fail {active_mds_id} --yes-i-really-mean-it')
+
+    def test_with_health_warn_with_2_active_MDSs(self):
+        '''
+        Test that, when a CephFS has 2 active MDSs and one of them has
+        either the health warning MDS_TRIM or MDS_CACHE_OVERSIZED, running
+        "ceph mds fail" fails for both MDSs without the confirmation flag
+        and passes for both when the confirmation flag is passed.
+        '''
+        health_warn = 'MDS_CACHE_OVERSIZED'
+        self.fs.set_max_mds(2)
+        self.config_set('mds', 'mds_cache_memory_limit', '1K')
+        self.config_set('mds', 'mds_health_cache_threshold', '1.00000')
+        self.fs.wait_for_daemons()
+        mds1_id, mds2_id = self.fs.get_active_names()
+
+        self.mount_a.open_n_background('.', 400)
+        self.wait_till_health_warn(health_warn, mds1_id)
+
+        health_report = json.loads(self.get_ceph_cmd_stdout('health detail '
+                                                            '--format json'))
+        # MDS ID for which the health warning has been generated.
+        hw_mds_id = self._get_unhealthy_mds_id(health_report, health_warn)
+        if mds1_id == hw_mds_id:
+            non_hw_mds_id = mds2_id
+        elif mds2_id == hw_mds_id:
+            non_hw_mds_id = mds1_id
+        else:
+            raise RuntimeError('There are only 2 MDSs right now but '
+                               'apparently the health warning was raised '
+                               'for an MDS other than these two. This is '
+                               'definitely an error.')
+
+        # actual testing begins now...
+        errmsg = 'mds_cache_oversized'
+        self.negtest_ceph_cmd(args=f'mds fail {non_hw_mds_id}', retval=1,
+                              errmsgs=errmsg)
+        self.negtest_ceph_cmd(args=f'mds fail {hw_mds_id}', retval=1,
+                              errmsgs=errmsg)
+        self.run_ceph_cmd(f'mds fail {mds1_id} --yes-i-really-mean-it')
+        self.run_ceph_cmd(f'mds fail {mds2_id} --yes-i-really-mean-it')
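
For reference, a minimal, self-contained sketch of the MDS-ID extraction
done by _get_unhealthy_mds_id() above; the sample message mirrors the one
quoted in that helper's comment:

    # sample "message" field from a health check's 'detail' entry
    msg = ('mds.b(mds.0): Behind on trimming (865/10) max_segments: 10, '
           'num_segments: 865')
    # the daemon name precedes the '('; dropping the 'mds.' prefix yields
    # the bare MDS ID that "ceph mds fail" accepts
    mds_id = msg.split('(')[0].replace('mds.', '')
    assert mds_id == 'b'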