git-server-git.apps.pok.os.sepia.ceph.com Git - ceph.git/commitdiff
Merge pull request #57492 from rishabh-d-dave/qa-fs-mds-fail-improve
author: Rishabh Dave <ridave@redhat.com>
Thu, 11 Jul 2024 11:07:16 +0000 (16:37 +0530)
committer: GitHub <noreply@github.com>
Thu, 11 Jul 2024 11:07:16 +0000 (16:37 +0530)
qa/cephfs: improvements for "mds fail" and "fs fail"

Reviewed-by: Patrick Donnelly <pdonnell@redhat.com>
1  2 
qa/tasks/cephfs/test_admin.py

index b4bc504547a7a6c95d70b1abe1cfc7ef2c27576a,4d982741135666d3fb3b7b0e822bd4db51b9e1e0..a321c0e3457a64e2f5b548e4a50ee0fb8fcbf431
@@@ -165,140 -91,29 +165,131 @@@ class TestAdminCommands(CephFSTestCase)
          if overwrites:
              self.run_ceph_cmd('osd', 'pool', 'set', n+"-data", 'allow_ec_overwrites', 'true')
  
-     def _get_unhealthy_mds_id(self, health_report, health_warn):
-         '''
-         Return MDS ID for which health warning in "health_warn" has been
-         generated.
-         '''
-         # variable "msg" should hold a string like this -
-         # 'mds.b(mds.0): Behind on trimming (865/10) max_segments: 10,
-         # num_segments: 86'
-         msg = health_report['checks'][health_warn]['detail'][0]['message']
-         mds_id = msg.split('(')[0]
-         mds_id = mds_id.replace('mds.', '')
-         return mds_id
+     def gen_health_warn_mds_cache_oversized(self):
+         health_warn = 'MDS_CACHE_OVERSIZED'
  
-     def wait_till_health_warn(self, health_warn, active_mds_id, sleep=3,
-                               tries=10):
-         errmsg = (f'Expected health warning "{health_warn}" to eventually '
-                   'show up in output of command "ceph health detail". Tried '
-                   f'{tries} times with interval of {sleep} seconds but the '
-                   'health warning didn\'t turn up.')
+         self.config_set('mds', 'mds_cache_memory_limit', '1K')
+         self.config_set('mds', 'mds_health_cache_threshold', '1.00000')
+         self.mount_a.open_n_background('.', 400)
  
-         with safe_while(sleep=sleep, tries=tries, action=errmsg) as proceed:
-             while proceed():
-                 self.get_ceph_cmd_stdout(
-                     f'tell mds.{active_mds_id} cache status')
+         self.wait_for_health(health_warn, 30)
  
-                 health_report = json.loads(self.get_ceph_cmd_stdout(
-                     'health detail --format json'))
+     def gen_health_warn_mds_trim(self):
+         health_warn = 'MDS_TRIM'
+         # for generating health warning MDS_TRIM
+         self.config_set('mds', 'mds_debug_subtrees', 'true')
+         # this drastically slows down trimming, so that MDS_TRIM persists
+         # for longer.
+         self.config_set('mds', 'mds_log_trim_decay_rate', '60')
+         self.config_set('mds', 'mds_log_trim_threshold', '1')
+         self.mount_a.open_n_background('.', 400)
  
-                 if health_warn in health_report['checks']:
-                     return
+         self.wait_for_health(health_warn, 30)
  
  
 +class TestMdsLastSeen(CephFSTestCase):
 +    """
 +    Tests for `mds last-seen` command.
 +    """
 +
 +    MDSS_REQUIRED = 2
 +
 +    def test_in_text(self):
 +        """
 +        That `mds last-seen` returns 0 for an MDS currently in the map.
 +        """
 +
 +        status = self.fs.status()
 +        r0 = self.fs.get_rank(0, status=status)
 +        s = self.get_ceph_cmd_stdout("mds", "last-seen", r0['name'])
 +        seconds = int(re.match(r"^(\d+)s$", s).group(1))
 +        self.assertEqual(seconds, 0)
 +
 +    def test_in_json(self):
 +        """
 +        That `mds last-seen` returns 0 for an MDS currently in the map.
 +        """
 +
 +        status = self.fs.status()
 +        r0 = self.fs.get_rank(0, status=status)
 +        s = self.get_ceph_cmd_stdout("--format=json", "mds", "last-seen", r0['name'])
 +        J = json.loads(s)
 +        seconds = int(re.match(r"^(\d+)s$", J['last-seen']).group(1))
 +        self.assertEqual(seconds, 0)
 +
 +    def test_unknown(self):
 +        """
 +        That `mds last-seen` returns ENOENT for an mds not in recent maps.
 +        """
 +
 +        try:
 +            self.get_ceph_cmd_stdout("--format=json", "mds", "last-seen", 'foo')
 +        except CommandFailedError as e:
 +            self.assertEqual(e.exitstatus, errno.ENOENT)
 +        else:
 +            self.fail("non-existent mds should fail ENOENT")
 +
 +    def test_standby(self):
 +        """
 +        That `mds last-seen` returns 0 for a standby.
 +        """
 +
 +        status = self.fs.status()
 +        for info in status.get_standbys():
 +            s = self.get_ceph_cmd_stdout("--format=json", "mds", "last-seen", info['name'])
 +            J = json.loads(s)
 +            seconds = int(re.match(r"^(\d+)s$", J['last-seen']).group(1))
 +            self.assertEqual(seconds, 0)
 +
 +    def test_stopped(self):
 +        """
 +        That `mds last-seen` returns >0 for mds that is stopped.
 +        """
 +
 +        status = self.fs.status()
 +        r0 = self.fs.get_rank(0, status=status)
 +        self.fs.mds_stop(mds_id=r0['name'])
 +        self.fs.rank_fail()
 +        sleep(2)
 +        with safe_while(sleep=1, tries=self.fs.beacon_timeout, action='wait for last-seen >0') as proceed:
 +            while proceed():
 +                s = self.get_ceph_cmd_stdout("--format=json", "mds", "last-seen", r0['name'])
 +                J = json.loads(s)
 +                seconds = int(re.match(r"^(\d+)s$", J['last-seen']).group(1))
 +                if seconds == 0:
 +                    continue
 +                self.assertGreater(seconds, 1)
 +                break
 +
 +    def test_gc(self):
 +        """
 +        That historical mds information is eventually garbage collected.
 +        """
 +
 +        prune_time = 20
 +        sleep_time = 2
 +        self.config_set('mon', 'mon_fsmap_prune_threshold', prune_time)
 +        status = self.fs.status()
 +        r0 = self.fs.get_rank(0, status=status)
 +        self.fs.mds_stop(mds_id=r0['name'])
 +        self.fs.rank_fail()
 +        last = 0
 +        for i in range(prune_time):
 +            sleep(sleep_time) # we will sleep twice prune_time
 +            try:
 +                s = self.get_ceph_cmd_stdout("--format=json", "mds", "last-seen", r0['name'])
 +                J = json.loads(s)
 +                seconds = int(re.match(r"^(\d+)s$", J['last-seen']).group(1))
 +                self.assertGreater(seconds, last)
 +                log.debug("last_seen: %ds", seconds)
 +                last = seconds
 +            except CommandFailedError as e:
 +                self.assertEqual(e.exitstatus, errno.ENOENT)
 +                self.assertGreaterEqual(last + sleep_time + 1, prune_time) # rounding error add 1
 +                return
 +        self.fail("map was no garbage collected as expected")
 +
  @classhook('_add_valid_tell')
  class TestValidTell(TestAdminCommands):
      @classmethod