]> git-server-git.apps.pok.os.sepia.ceph.com Git - ceph.git/commitdiff
qa/cephfs: lazy-force unmount clients that have been evicted 46988/head
authorRishabh Dave <ridave@redhat.com>
Wed, 20 Jul 2022 09:46:53 +0000 (15:16 +0530)
committerRishabh Dave <ridave@redhat.com>
Fri, 19 Aug 2022 15:30:00 +0000 (21:00 +0530)
Before unmounting, check if the client has been evicted and, if so, run
"umount -f -l" on the client's mount point and clean up the mount
right after it.

Attempting to unmount, clean up or operate in any way on the mount point
of an evicted client will hang the operation (and thereby our Python
code too). Lazy-force unmount prevents such hangs for our Python code
and also frees the mount point.

This commit also adds code to gather session info for kernel mounts
after mounting is successful. This is necessary since the network address
of the session is needed to check whether it is blocked by the Ceph cluster.

Fixes: https://tracker.ceph.com/issues/56476
Signed-off-by: Rishabh Dave <ridave@redhat.com>
qa/tasks/cephfs/fuse_mount.py
qa/tasks/cephfs/kernel_mount.py
qa/tasks/cephfs/mount.py
qa/tasks/vstart_runner.py

index fa0d84b842b3bf4722eb569ac14dca207e63424b..7f8b11017ec174ade03d9d04088af3e44f8b3c09 100644 (file)
@@ -312,6 +312,11 @@ class FuseMount(CephFSMount):
             if cleanup:
                 self.cleanup()
             return
+        if self.is_blocked():
+            self._run_umount_lf()
+            if cleanup:
+                self.cleanup()
+            return
 
         try:
             log.info('Running fusermount -u on {name}...'.format(name=self.client_remote.name))
@@ -344,15 +349,8 @@ class FuseMount(CephFSMount):
                     """).format(self._fuse_conn))
                     self._fuse_conn = None
 
-                stderr = StringIO()
                 # make sure its unmounted
-                try:
-                    self.client_remote.run(
-                        args=['sudo', 'umount', '-l', '-f', self.hostfs_mntpt],
-                        stderr=stderr, timeout=UMOUNT_TIMEOUT, omit_sudo=False)
-                except CommandFailedError:
-                    if self.is_mounted():
-                        raise
+                self._run_umount_lf()
 
         self._fuse_conn = None
         self.id = None
@@ -386,6 +384,11 @@ class FuseMount(CephFSMount):
             # mount -o remount (especially if the remount is stuck because MDSs
             # are unavailable)
 
+        if self.is_blocked():
+            self._run_umount_lf()
+            self.cleanup()
+            return
+
         # cleanup is set to False since cleanup must happen after umount is
         # complete; otherwise the following call to run.wait hangs.
         self.umount(cleanup=False)
index c74e637d4416e7f0272bd2a9ae074b7634202312..e1948926701df00fc87ff73d31a154ebf92482f7 100644 (file)
@@ -62,6 +62,13 @@ class KernelMount(CephFSMount):
                 self.enable_dynamic_debug()
             self.ctx[f'kmount_count.{self.client_remote.hostname}'] = kmount_count + 1
 
+        self.gather_mount_info()
+
+    def gather_mount_info(self):
+        """
+        Gather session info for this mount once mounting has succeeded.
+
+        Stores the client's global ID in self.id, then calls
+        get_global_inst() and get_global_addr(). The session's network
+        address is needed to check whether the client has been blocked by
+        the Ceph cluster.
+        """
+        self.id = self._get_global_id()
+        self.get_global_inst()
+        self.get_global_addr()
+
     def _run_mount_cmd(self, mntopts, check_status):
         mount_cmd = self._get_mount_cmd(mntopts)
         mountcmd_stdout, mountcmd_stderr = StringIO(), StringIO()
@@ -133,6 +140,11 @@ class KernelMount(CephFSMount):
             self.cleanup()
             return
 
+        if self.is_blocked():
+            self._run_umount_lf()
+            self.cleanup()
+            return
+
         log.debug('Unmounting client client.{id}...'.format(id=self.client_id))
 
         try:
@@ -173,11 +185,7 @@ class KernelMount(CephFSMount):
                 raise
 
             # force delete the netns and umount
-            log.debug('Force/lazy unmounting on client.{id}...'.format(id=self.client_id))
-            self.client_remote.run(args=['sudo', 'umount', '-f', '-l',
-                                         self.mountpoint], timeout=timeout,
-                                   omit_sudo=False)
-
+            self._run_umount_lf()
             self.cleanup()
 
     def wait_until_mounted(self):
index 89dbe5111208c73200c1da96aa053911ef1674d5..874b27f4f60d5f06b56a14b5fcab1544aa466253 100644 (file)
@@ -168,6 +168,12 @@ class CephFSMount(object):
                      get_file(self.client_remote, self.client_keyring_path,
                               sudo=True).decode())
 
+    def is_blocked(self):
+        """
+        Check if this client's address is listed in the output of
+        "osd blocklist ls".
+
+        :return: True if the client's address occurs in the blocklist
+                 output; False if it does not or if no address is known
+                 for this mount.
+        """
+        self.fs = Filesystem(self.ctx, name=self.cephfs_name)
+
+        # Without an address nothing can be looked up: None would make the
+        # "in" test below raise TypeError and '' would match any output,
+        # wrongly reporting the client as blocked.
+        addr = getattr(self, 'addr', None)
+        if not addr:
+            return False
+
+        output = self.fs.mon_manager.raw_cluster_cmd(args='osd blocklist ls')
+        return addr in output
+
     def is_stuck(self):
         """
         Check if mount is stuck/in a hanged state.
@@ -473,6 +479,19 @@ class CephFSMount(object):
         self.mount(**kwargs)
         self.wait_until_mounted()
 
+    def _run_umount_lf(self):
+        """
+        Lazy-force unmount ("umount --lazy --force") this client's mount
+        point.
+
+        A failing umount command is tolerated as long as the mount point is
+        not mounted afterwards; otherwise CommandFailedError is re-raised.
+
+        :return: the object returned by client_remote.run(), or None if the
+                 command failed but the mount point is not mounted.
+        """
+        log.debug(f'Force/lazy unmounting on client.{self.client_id}')
+
+        # Bind proc up front: when the command fails and the failure is
+        # tolerated below, "return proc" would otherwise raise
+        # UnboundLocalError.
+        proc = None
+        try:
+            proc = self.client_remote.run(
+                args=f'sudo umount --lazy --force {self.hostfs_mntpt}',
+                timeout=UMOUNT_TIMEOUT, omit_sudo=False)
+        except CommandFailedError:
+            if self.is_mounted():
+                raise
+
+        return proc
+
     def umount(self):
         raise NotImplementedError()
 
index 70e2dd147f76148ef22e10240ce47d514a8e2057..7e33ba0d48a9073de70b0a48f277145b69bbb255 100644 (file)
@@ -55,6 +55,7 @@ vstart_runner.py -
 """
 
 from io import StringIO
+from json import loads
 from collections import defaultdict
 import getpass
 import signal
@@ -647,6 +648,12 @@ class LocalCephFSMount():
         self.fs.wait_for_daemons()
         log.info('Ready to start {}...'.format(type(self).__name__))
 
+    def is_blocked(self):
+        """
+        Check if this client's address is listed in the output of
+        "osd blocklist ls".
+
+        :return: True if the client's address occurs in the blocklist
+                 output; False if it does not or if no address is known
+                 for this mount.
+        """
+        self.fs = LocalFilesystem(self.ctx, name=self.cephfs_name)
+
+        # Without an address nothing can be looked up: None would make the
+        # "in" test below raise TypeError and '' would match any output,
+        # wrongly reporting the client as blocked.
+        addr = getattr(self, 'addr', None)
+        if not addr:
+            return False
+
+        output = self.fs.mon_manager.raw_cluster_cmd(args='osd blocklist ls')
+        return addr in output
+
 
 class LocalKernelMount(LocalCephFSMount, KernelMount):
     def __init__(self, ctx, test_dir, client_id=None,
@@ -661,6 +668,21 @@ class LocalKernelMount(LocalCephFSMount, KernelMount):
         # Make vstart_runner compatible with teuth and qa/tasks/cephfs.
         self._mount_bin = [os.path.join(BIN_PREFIX , 'mount.ceph')]
 
+    def get_global_addr(self):
+        """
+        Refresh self.inst via get_global_inst() and store the part of it
+        that follows the first space in self.addr.
+
+        :return: the value stored in self.addr.
+        """
+        self.get_global_inst()
+        # split() yields the whole string when it contains no space, which
+        # is the same result as slicing from find(' ') + 1.
+        addr = self.inst.split(' ', 1)[-1]
+        self.addr = addr
+        return self.addr
+
+    def get_global_inst(self):
+        """
+        Look up this client's session in the JSON output of "session ls"
+        and store its "inst" value in self.inst.
+
+        :return: the stored inst, or None when no listed session has an ID
+                 equal to self.id (self.inst is left untouched then).
+        """
+        session_ls = self.client_remote.run(
+            args=f'{CEPH_CMD} tell mds.* session ls',
+            stdout=StringIO())
+        sessions = loads(session_ls.stdout.getvalue())
+
+        # first session whose ID is ours, if any
+        session = next((s for s in sessions if s['id'] == self.id), None)
+        if session is None:
+            return None
+
+        self.inst = session['inst']
+        return self.inst
+
 
 class LocalFuseMount(LocalCephFSMount, FuseMount):
     def __init__(self, ctx, test_dir, client_id, client_keyring_path=None,