From: Rishabh Dave Date: Wed, 20 Jul 2022 09:46:53 +0000 (+0530) Subject: qa/cephfs: lazy-force unmount clients that have been evicted X-Git-Tag: v18.0.0~119^2 X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=c279b47ec90367ed34255d8a4eb35c24f9453c86;p=ceph.git qa/cephfs: lazy-force unmount clients that have been evicted Before unmounting, check if the client has been evicted and, if so, run "umount -f -l" for the mount point of the client and clean up the mount right after it. Attempting to unmount, clean up or operate in any way over the mount point of an evicted client will hang the operation (and thereby our Python code too). Lazy-force unmount prevents such hangs for our Python code and also frees the mount point. This commit also adds code to gather session info for kernel mounts after mounting is successful. This is a necessity since the network address of the session is needed to check if it is blocked by the Ceph cluster. Fixes: https://tracker.ceph.com/issues/56476 Signed-off-by: Rishabh Dave --- diff --git a/qa/tasks/cephfs/fuse_mount.py b/qa/tasks/cephfs/fuse_mount.py index fa0d84b842b3..7f8b11017ec1 100644 --- a/qa/tasks/cephfs/fuse_mount.py +++ b/qa/tasks/cephfs/fuse_mount.py @@ -312,6 +312,11 @@ class FuseMount(CephFSMount): if cleanup: self.cleanup() return + if self.is_blocked(): + self._run_umount_lf() + if cleanup: + self.cleanup() + return try: log.info('Running fusermount -u on {name}...'.format(name=self.client_remote.name)) @@ -344,15 +349,8 @@ class FuseMount(CephFSMount): """).format(self._fuse_conn)) self._fuse_conn = None - stderr = StringIO() # make sure its unmounted - try: - self.client_remote.run( - args=['sudo', 'umount', '-l', '-f', self.hostfs_mntpt], - stderr=stderr, timeout=UMOUNT_TIMEOUT, omit_sudo=False) - except CommandFailedError: - if self.is_mounted(): - raise + self._run_umount_lf() self._fuse_conn = None self.id = None @@ -386,6 +384,11 @@ class FuseMount(CephFSMount): # mount -o remount (especially if the 
remount is stuck because MDSs # are unavailable) + if self.is_blocked(): + self._run_umount_lf() + self.cleanup() + return + # cleanup is set to to fail since clieanup must happen after umount is # complete; otherwise following call to run.wait hangs. self.umount(cleanup=False) diff --git a/qa/tasks/cephfs/kernel_mount.py b/qa/tasks/cephfs/kernel_mount.py index c74e637d4416..e1948926701d 100644 --- a/qa/tasks/cephfs/kernel_mount.py +++ b/qa/tasks/cephfs/kernel_mount.py @@ -62,6 +62,13 @@ class KernelMount(CephFSMount): self.enable_dynamic_debug() self.ctx[f'kmount_count.{self.client_remote.hostname}'] = kmount_count + 1 + self.gather_mount_info() + + def gather_mount_info(self): + self.id = self._get_global_id() + self.get_global_inst() + self.get_global_addr() + def _run_mount_cmd(self, mntopts, check_status): mount_cmd = self._get_mount_cmd(mntopts) mountcmd_stdout, mountcmd_stderr = StringIO(), StringIO() @@ -133,6 +140,11 @@ class KernelMount(CephFSMount): self.cleanup() return + if self.is_blocked(): + self._run_umount_lf() + self.cleanup() + return + log.debug('Unmounting client client.{id}...'.format(id=self.client_id)) try: @@ -173,11 +185,7 @@ class KernelMount(CephFSMount): raise # force delete the netns and umount - log.debug('Force/lazy unmounting on client.{id}...'.format(id=self.client_id)) - self.client_remote.run(args=['sudo', 'umount', '-f', '-l', - self.mountpoint], timeout=timeout, - omit_sudo=False) - + self._run_umount_lf() self.cleanup() def wait_until_mounted(self): diff --git a/qa/tasks/cephfs/mount.py b/qa/tasks/cephfs/mount.py index 89dbe5111208..874b27f4f60d 100644 --- a/qa/tasks/cephfs/mount.py +++ b/qa/tasks/cephfs/mount.py @@ -168,6 +168,12 @@ class CephFSMount(object): get_file(self.client_remote, self.client_keyring_path, sudo=True).decode()) + def is_blocked(self): + self.fs = Filesystem(self.ctx, name=self.cephfs_name) + + output = self.fs.mon_manager.raw_cluster_cmd(args='osd blocklist ls') + return self.addr in output + def 
is_stuck(self): """ Check if mount is stuck/in a hanged state. @@ -473,6 +479,19 @@ class CephFSMount(object): self.mount(**kwargs) self.wait_until_mounted() + def _run_umount_lf(self): + log.debug(f'Force/lazy unmounting on client.{self.client_id}') + + try: + proc = self.client_remote.run( + args=f'sudo umount --lazy --force {self.hostfs_mntpt}', + timeout=UMOUNT_TIMEOUT, omit_sudo=False) + except CommandFailedError: + if self.is_mounted(): + raise + + return proc + def umount(self): raise NotImplementedError() diff --git a/qa/tasks/vstart_runner.py b/qa/tasks/vstart_runner.py index 70e2dd147f76..7e33ba0d48a9 100644 --- a/qa/tasks/vstart_runner.py +++ b/qa/tasks/vstart_runner.py @@ -55,6 +55,7 @@ vstart_runner.py - """ from io import StringIO +from json import loads from collections import defaultdict import getpass import signal @@ -647,6 +648,12 @@ class LocalCephFSMount(): self.fs.wait_for_daemons() log.info('Ready to start {}...'.format(type(self).__name__)) + def is_blocked(self): + self.fs = LocalFilesystem(self.ctx, name=self.cephfs_name) + + output = self.fs.mon_manager.raw_cluster_cmd(args='osd blocklist ls') + return self.addr in output + class LocalKernelMount(LocalCephFSMount, KernelMount): def __init__(self, ctx, test_dir, client_id=None, @@ -661,6 +668,21 @@ class LocalKernelMount(LocalCephFSMount, KernelMount): # Make vstart_runner compatible with teuth and qa/tasks/cephfs. self._mount_bin = [os.path.join(BIN_PREFIX , 'mount.ceph')] + def get_global_addr(self): + self.get_global_inst() + self.addr = self.inst[self.inst.find(' ') + 1 : ] + return self.addr + + def get_global_inst(self): + clients = self.client_remote.run( + args=f'{CEPH_CMD} tell mds.* session ls', + stdout=StringIO()).stdout.getvalue() + clients = loads(clients) + for c in clients: + if c['id'] == self.id: + self.inst = c['inst'] + return self.inst + class LocalFuseMount(LocalCephFSMount, FuseMount): def __init__(self, ctx, test_dir, client_id, client_keyring_path=None,