From 8fa12844fb3499e26c65a8b976064a6954aa7f97 Mon Sep 17 00:00:00 2001
From: Kotresh HR
Date: Sat, 16 Jan 2021 01:37:14 +0530
Subject: [PATCH] mgr/volumes: Evict clients based on auth-IDs and subvolume
 mounted

Add a 'subvolume evict' command which evicts the subvolume mounts
that were mounted using a particular auth-ID.

Fixes: https://tracker.ceph.com/issues/44928
Signed-off-by: Kotresh HR
(cherry picked from commit 269adcc8b8ab0742ba741ed7c2b59ccfb17a63f9)

Conflicts:
	qa/tasks/cephfs/test_volumes.py: A few of the tests were
	re-organized, hence the conflicts. Resolved the same.
---
 doc/cephfs/fs-volumes.rst                     |   4 +
 qa/tasks/cephfs/test_volumes.py               |  68 +++++++++++
 src/pybind/mgr/volumes/fs/exception.py        |  26 ++++
 .../mgr/volumes/fs/operations/rankevicter.py  | 114 ++++++++++++++++++
 .../mgr/volumes/fs/operations/template.py     |   2 +
 .../fs/operations/versions/subvolume_v1.py    |  51 +++++++-
 .../mgr/volumes/fs/operations/volume.py       |  11 ++
 src/pybind/mgr/volumes/fs/volume.py           |  28 ++++-
 src/pybind/mgr/volumes/module.py              |  19 +++
 9 files changed, 320 insertions(+), 3 deletions(-)
 create mode 100644 src/pybind/mgr/volumes/fs/operations/rankevicter.py

diff --git a/doc/cephfs/fs-volumes.rst b/doc/cephfs/fs-volumes.rst
index e81b1df6a5c8c..01510d02c26d1 100644
--- a/doc/cephfs/fs-volumes.rst
+++ b/doc/cephfs/fs-volumes.rst
@@ -167,6 +167,10 @@ List cephx auth IDs authorized to access fs subvolume::
 
   $ ceph fs subvolume authorized_list <vol_name> <sub_name> [--group_name=<group_name>]
 
+Evict fs clients based on auth ID and subvolume mounted::
+
+  $ ceph fs subvolume evict <vol_name> <sub_name> <auth_id> [--group_name=<group_name>]
+
 Fetch the absolute path of a subvolume using::
 
   $ ceph fs subvolume getpath <vol_name> <sub_name> [--group_name <subvol_group_name>]
diff --git a/qa/tasks/cephfs/test_volumes.py b/qa/tasks/cephfs/test_volumes.py
index f2a042674a6fb..e89c50402c089 100644
--- a/qa/tasks/cephfs/test_volumes.py
+++ b/qa/tasks/cephfs/test_volumes.py
@@ -1907,6 +1907,74 @@ class TestVolumes(CephFSTestCase):
         self._fs_cmd("subvolume", "rm", self.volname, subvolume2, "--group_name", group)
         self._fs_cmd("subvolumegroup", "rm", self.volname, group)
 
+    def test_subvolume_evict_client(self):
+        """
+        That a subvolume client can be evicted based on the auth ID
+        """
+
+        subvolumes = self._generate_random_subvolume_name(2)
+        group = self._generate_random_group_name()
+
+        # create group
+        self._fs_cmd("subvolumegroup", "create", self.volname, group)
+
+        # mounts[0] and mounts[1] would be used as guests to mount the volumes/shares.
+        for i in range(0, 2):
+            self.mounts[i].umount_wait()
+        guest_mounts = (self.mounts[0], self.mounts[1])
+        auth_id = "guest"
+        guestclient_1 = {
+            "auth_id": auth_id,
+            "tenant_id": "tenant1",
+        }
+
+        # Create two subvolumes. Authorize 'guest' auth ID to mount the two
+        # subvolumes. Mount the two subvolumes. Write data to the volumes.
+        for i in range(2):
+            # Create subvolume.
+            self._fs_cmd("subvolume", "create", self.volname, subvolumes[i], "--group_name", group)
+
+            # authorize guest authID read-write access to subvolume
+            key = self._fs_cmd("subvolume", "authorize", self.volname, subvolumes[i], guestclient_1["auth_id"],
+                               "--group_name", group, "--tenant_id", guestclient_1["tenant_id"])
+            mount_path = self._fs_cmd("subvolume", "getpath", self.volname, subvolumes[i],
+                                      "--group_name", group).rstrip()
+            # configure credentials for guest client
+            self._configure_guest_auth(guest_mounts[i], auth_id, key)
+
+            # mount the subvolume, and write to it
+            guest_mounts[i].mount(mount_path=mount_path)
+            guest_mounts[i].write_n_mb("data.bin", 1)
+
+        # Evict client guest_mounts[0], which is using auth ID 'guest' and has
+        # mounted one of the subvolumes.
+        self._fs_cmd("subvolume", "evict", self.volname, subvolumes[0], auth_id, "--group_name", group)
+
+        # The evicted guest client, guest_mounts[0], should not be able to do
+        # any more metadata ops. It should start failing all operations
+        # when it sees that its own address is in the blocklist.
+        try:
+            guest_mounts[0].write_n_mb("rogue.bin", 1)
+        except CommandFailedError:
+            pass
+        else:
+            raise RuntimeError("post-eviction write should have failed!")
+
+        # The blocklisted guest client should now be unmountable
+        guest_mounts[0].umount_wait()
+
+        # Guest client guest_mounts[1], which uses the same auth ID 'guest' but
+        # has mounted the other subvolume, should be able to use its subvolume
+        # unaffected.
+        guest_mounts[1].write_n_mb("data.bin.1", 1)
+
+        # Cleanup.
+        guest_mounts[1].umount_wait()
+        for i in range(2):
+            self._fs_cmd("subvolume", "deauthorize", self.volname, subvolumes[i], auth_id, "--group_name", group)
+            self._fs_cmd("subvolume", "rm", self.volname, subvolumes[i], "--group_name", group)
+        self._fs_cmd("subvolumegroup", "rm", self.volname, group)
 
     def test_subvolume_group_create_with_invalid_data_pool_layout(self):
         group = self._generate_random_group_name()
diff --git a/src/pybind/mgr/volumes/fs/exception.py b/src/pybind/mgr/volumes/fs/exception.py
index de0b19109f354..4f903b99ceef2 100644
--- a/src/pybind/mgr/volumes/fs/exception.py
+++ b/src/pybind/mgr/volumes/fs/exception.py
@@ -35,3 +35,29 @@ class OpSmException(Exception):
 
 class NotImplementedException(Exception):
     pass
+
+class ClusterTimeout(Exception):
+    """
+    Exception indicating that we timed out trying to talk to the Ceph cluster,
+    either to the mons, or to any individual daemon that the mons indicate ought
+    to be up but isn't responding to us.
+    """
+    pass
+
+class ClusterError(Exception):
+    """
+    Exception indicating that the cluster returned an error to a command that
+    we thought should be successful based on our last knowledge of the cluster
+    state.
+    """
+    def __init__(self, action, result_code, result_str):
+        self._action = action
+        self._result_code = result_code
+        self._result_str = result_str
+
+    def __str__(self):
+        return "Error {0} (\"{1}\") while {2}".format(
+            self._result_code, self._result_str, self._action)
+
+class EvictionError(Exception):
+    pass
diff --git a/src/pybind/mgr/volumes/fs/operations/rankevicter.py b/src/pybind/mgr/volumes/fs/operations/rankevicter.py
new file mode 100644
index 0000000000000..5b945c3894257
--- /dev/null
+++ b/src/pybind/mgr/volumes/fs/operations/rankevicter.py
@@ -0,0 +1,114 @@
+import errno
+import json
+import logging
+import threading
+import time
+
+from .volume import get_mds_map
+from ..exception import ClusterTimeout, ClusterError
+
+log = logging.getLogger(__name__)
+
+class RankEvicter(threading.Thread):
+    """
+    Thread for evicting client(s) from a particular MDS daemon instance.
+
+    This is more complex than simply sending a command, because we have to
+    handle cases where MDS daemons might not be fully up yet, and/or might
+    be transiently unresponsive to commands.
+    """
+    class GidGone(Exception):
+        pass
+
+    POLL_PERIOD = 5
+
+    def __init__(self, mgr, fs, client_spec, volname, rank, gid, mds_map, ready_timeout):
+        """
+        :param client_spec: list of strings, used as filter arguments to "session evict";
+                            e.g. pass ["id=123"] to evict a single client with session id 123.
+        """
+        self.volname = volname
+        self.rank = rank
+        self.gid = gid
+        self._mds_map = mds_map
+        self._client_spec = client_spec
+        self._fs = fs
+        self._ready_timeout = ready_timeout
+        self._ready_waited = 0
+        self.mgr = mgr
+
+        self.success = False
+        self.exception = None
+
+        super(RankEvicter, self).__init__()
+
+    def _ready_to_evict(self):
+        if self._mds_map['up'].get("mds_{0}".format(self.rank), None) != self.gid:
+            log.info("Evicting {0} from {1}/{2}: rank no longer associated with gid, done.".format(
+                self._client_spec, self.rank, self.gid
+            ))
+            raise RankEvicter.GidGone()
+
+        info = self._mds_map['info']["gid_{0}".format(self.gid)]
+        log.debug("_ready_to_evict: state={0}".format(info['state']))
+        return info['state'] in ["up:active", "up:clientreplay"]
+
+    def _wait_for_ready(self):
+        """
+        Wait for this MDS rank to reach an active or clientreplay state, and
+        not be laggy.
+        """
+        while not self._ready_to_evict():
+            if self._ready_waited > self._ready_timeout:
+                raise ClusterTimeout()
+
+            time.sleep(self.POLL_PERIOD)
+            self._ready_waited += self.POLL_PERIOD
+            self._mds_map = get_mds_map(self.mgr, self.volname)
+
+    def _evict(self):
+        """
+        Run the eviction procedure. Return True on success; raise ClusterError
+        or ClusterTimeout on failure.
+        """
+
+        # Wait until the MDS is believed by the mon to be available for commands
+        try:
+            self._wait_for_ready()
+        except self.GidGone:
+            return True
+
+        # Then send it an evict
+        ret = -errno.ETIMEDOUT
+        while ret == -errno.ETIMEDOUT:
+            log.debug("mds_command: {0}, {1}".format(
+                "%s" % self.gid, ["session", "evict"] + self._client_spec
+            ))
+            ret, outb, outs = self._fs.mds_command(
+                "%s" % self.gid,
+                json.dumps({
+                    "prefix": "session evict",
+                    "filters": self._client_spec
+                }), "")
+            log.debug("mds_command: complete {0} {1}".format(ret, outs))
+
+            # If we get a clean response, great, it's gone from that rank.
+            if ret == 0:
+                return True
+            elif ret == -errno.ETIMEDOUT:
+                # Oh no, the MDS went laggy (that's how libcephfs knows to emit this error)
+                self._mds_map = get_mds_map(self.mgr, self.volname)
+                try:
+                    self._wait_for_ready()
+                except self.GidGone:
+                    return True
+            else:
+                raise ClusterError("Sending evict to mds.{0}".format(self.gid), ret, outs)
+
+    def run(self):
+        try:
+            self._evict()
+        except Exception as e:
+            self.success = False
+            self.exception = e
+        else:
+            self.success = True
diff --git a/src/pybind/mgr/volumes/fs/operations/template.py b/src/pybind/mgr/volumes/fs/operations/template.py
index 0ebf98c46ba95..d35ad0de021d1 100644
--- a/src/pybind/mgr/volumes/fs/operations/template.py
+++ b/src/pybind/mgr/volumes/fs/operations/template.py
@@ -58,6 +58,8 @@ class SubvolumeOpType(Enum):
     CLONE_INTERNAL   = 'clone_internal'
     ALLOW_ACCESS     = 'allow-access'
     DENY_ACCESS      = 'deny-access'
+    AUTH_LIST        = 'auth-list'
+    EVICT            = 'evict'
 
 class SubvolumeTemplate(object):
     VERSION = None # type: int
diff --git a/src/pybind/mgr/volumes/fs/operations/versions/subvolume_v1.py b/src/pybind/mgr/volumes/fs/operations/versions/subvolume_v1.py
index 71ece0729ebf6..b232b365840ba 100644
--- a/src/pybind/mgr/volumes/fs/operations/versions/subvolume_v1.py
+++ b/src/pybind/mgr/volumes/fs/operations/versions/subvolume_v1.py
@@ -17,10 +17,12 @@ from .subvolume_base import SubvolumeBase
 from ..template import SubvolumeTemplate
 from ..snapshot_util import mksnap, rmsnap
 from ..access import allow_access, deny_access
-from ...exception import IndexException, OpSmException, VolumeException, MetadataMgrException
+from ...exception import IndexException, OpSmException, VolumeException, MetadataMgrException, EvictionError
 from ...fs_util import listsnaps, is_inherited_snap
 from ..template import SubvolumeOpType
 from ..group import Group
+from ..rankevicter import RankEvicter
+from ..volume import get_mds_map
 
 from ..clone_index import open_clone_index, create_clone_index
 
@@ -583,6 +585,53 @@ class SubvolumeV1(SubvolumeBase, SubvolumeTemplate):
 
         return auths
 
+    def evict(self, volname, auth_id, timeout=30):
+        """
+        Evict all clients based on the authorization ID and the subvolume path mounted.
+        Assumes that the authorization key has been revoked prior to calling this function.
+
+        This operation can throw an exception if the mon cluster is unresponsive, or
+        any individual MDS daemon is unresponsive for longer than the timeout passed in.
+        """
+
+        client_spec = ["auth_name={0}".format(auth_id), ]
+        client_spec.append("client_metadata.root={0}".
+                           format(self.path.decode('utf-8')))
+
+        log.info("evict clients with {0}".format(', '.join(client_spec)))
+
+        mds_map = get_mds_map(self.mgr, volname)
+        if not mds_map:
+            raise VolumeException(-errno.ENOENT, "mdsmap for volume {0} not found".format(volname))
+
+        up = {}
+        for name, gid in mds_map['up'].items():
+            # Quirk of the MDSMap JSON dump: keys in the up dict are like "mds_0"
+            assert name.startswith("mds_")
+            up[int(name[4:])] = gid
+
+        # For all MDS ranks held by a daemon
+        # Do the parallelism in python instead of using "tell mds.*", because
+        # the latter doesn't give us per-mds output
+        threads = []
+        for rank, gid in up.items():
+            thread = RankEvicter(self.mgr, self.fs, client_spec, volname, rank, gid, mds_map, timeout)
+            thread.start()
+            threads.append(thread)
+
+        for t in threads:
+            t.join()
+
+        log.info("evict: joined all")
+
+        for t in threads:
+            if not t.success:
+                msg = ("Failed to evict client with {0} from mds {1}/{2}: {3}".
+                       format(', '.join(client_spec), t.rank, t.gid, t.exception)
+                      )
+                log.error(msg)
+                raise EvictionError(msg)
+
     def _get_clone_source(self):
         try:
             clone_source = {
diff --git a/src/pybind/mgr/volumes/fs/operations/volume.py b/src/pybind/mgr/volumes/fs/operations/volume.py
index f4814fff5d59d..410e5c44b6b21 100644
--- a/src/pybind/mgr/volumes/fs/operations/volume.py
+++ b/src/pybind/mgr/volumes/fs/operations/volume.py
@@ -201,6 +201,17 @@ def gen_pool_names(volname):
     """
     return "cephfs.{}.meta".format(volname), "cephfs.{}.data".format(volname)
 
+def get_mds_map(mgr, volname):
+    """
+    return mdsmap for a volname
+    """
+    mds_map = None
+    fs_map = mgr.get("fs_map")
+    for f in fs_map['filesystems']:
+        if volname == f['mdsmap']['fs_name']:
+            return f['mdsmap']
+    return mds_map
+
 def get_pool_names(mgr, volname):
     """
     return metadata and data pools (list) names of volume as a tuple
diff --git a/src/pybind/mgr/volumes/fs/volume.py b/src/pybind/mgr/volumes/fs/volume.py
index 7ad52ff1fded6..ab2875396a795 100644
--- a/src/pybind/mgr/volumes/fs/volume.py
+++ b/src/pybind/mgr/volumes/fs/volume.py
@@ -14,7 +14,7 @@ from .operations.subvolume import open_subvol, create_subvol, remove_subvol, \
     create_clone
 
 from .vol_spec import VolSpec
-from .exception import VolumeException
+from .exception import VolumeException, ClusterError, ClusterTimeout, EvictionError
 from .async_cloner import Cloner
 from .purge_queue import ThreadPoolPurgeQueueMixin
 from .operations.template import SubvolumeOpType
@@ -253,13 +253,37 @@ class VolumeClient(object):
         try:
             with open_volume(self, volname) as fs_handle:
                 with open_group(fs_handle, self.volspec, groupname) as group:
-                    with open_subvol(self.mgr, fs_handle, self.volspec, group, subvolname, SubvolumeOpType.ALLOW_ACCESS) as subvolume:
+                    with open_subvol(self.mgr, fs_handle, self.volspec, group, subvolname, SubvolumeOpType.AUTH_LIST) as subvolume:
                         auths = subvolume.authorized_list()
                         ret = 0, json.dumps(auths, indent=4, sort_keys=True), ""
         except VolumeException as ve:
             ret = self.volume_exception_to_retval(ve)
         return ret
 
+    def evict(self, **kwargs):
+        ret = 0, "", ""
+        volname = kwargs['vol_name']
+        subvolname = kwargs['sub_name']
+        authid = kwargs['auth_id']
+        groupname = kwargs['group_name']
+
+        try:
+            with open_volume(self, volname) as fs_handle:
+                with open_group(fs_handle, self.volspec, groupname) as group:
+                    with open_subvol(self.mgr, fs_handle, self.volspec, group, subvolname, SubvolumeOpType.EVICT) as subvolume:
+                        subvolume.evict(volname, authid)
+                        ret = 0, "", ""
+        except (VolumeException, ClusterTimeout, ClusterError, EvictionError) as e:
+            if isinstance(e, VolumeException):
+                ret = self.volume_exception_to_retval(e)
+            elif isinstance(e, ClusterTimeout):
+                ret = -errno.ETIMEDOUT, "", "Timed out trying to talk to the Ceph cluster"
+            elif isinstance(e, ClusterError):
+                ret = e._result_code, "", e._result_str
+            elif isinstance(e, EvictionError):
+                ret = -errno.EINVAL, "", str(e)
+        return ret
+
     def resize_subvolume(self, **kwargs):
         ret = 0, "", ""
         volname = kwargs['vol_name']
diff --git a/src/pybind/mgr/volumes/module.py b/src/pybind/mgr/volumes/module.py
index 54f77b4b20e19..3b9ed4a9545bf 100644
--- a/src/pybind/mgr/volumes/module.py
+++ b/src/pybind/mgr/volumes/module.py
@@ -144,6 +144,15 @@ class Module(orchestrator.OrchestratorClientMixin, MgrModule):
             'desc': "List auth IDs that have access to a subvolume",
             'perm': 'r'
         },
+        {
+            'cmd': 'fs subvolume evict '
+                   'name=vol_name,type=CephString '
+                   'name=sub_name,type=CephString '
+                   'name=auth_id,type=CephString '
+                   'name=group_name,type=CephString,req=false ',
+            'desc': "Evict clients based on auth IDs and subvolume mounted",
+            'perm': 'rw'
+        },
         {
             'cmd': 'fs subvolumegroup getpath '
                    'name=vol_name,type=CephString '
@@ -455,6 +464,16 @@ class Module(orchestrator.OrchestratorClientMixin, MgrModule):
                                        sub_name=cmd['sub_name'],
                                        group_name=cmd.get('group_name', None))
 
+    @mgr_cmd_wrap
+    def _cmd_fs_subvolume_evict(self, inbuf, cmd):
+        """
+        :return: a 3-tuple of return code (int), empty string (str), error message (str)
+        """
+        return self.vc.evict(vol_name=cmd['vol_name'],
+                             sub_name=cmd['sub_name'],
+                             auth_id=cmd['auth_id'],
+                             group_name=cmd.get('group_name', None))
+
     @mgr_cmd_wrap
     def _cmd_fs_subvolume_ls(self, inbuf, cmd):
        return self.vc.list_subvolumes(vol_name=cmd['vol_name'],
-- 
2.39.5
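
For reference, the snippet below is a minimal, self-contained sketch (plain Python, no Ceph cluster required) of the eviction filter this patch constructs: SubvolumeV1.evict() builds a client_spec from the auth ID and the subvolume's mount root, and RankEvicter._evict() wraps it into the JSON "session evict" command passed to mds_command() for each active MDS rank. The auth ID and subvolume path used here are hypothetical example values, not taken from the patch.

    import json

    def build_client_spec(auth_id, subvol_root):
        # Mirrors SubvolumeV1.evict(): match client sessions by auth name and by
        # the client_metadata.root recorded when the subvolume path was mounted.
        return ["auth_name={0}".format(auth_id),
                "client_metadata.root={0}".format(subvol_root)]

    def session_evict_payload(client_spec):
        # Mirrors RankEvicter._evict(): the JSON command body handed to mds_command().
        return json.dumps({"prefix": "session evict", "filters": client_spec})

    if __name__ == '__main__':
        # Hypothetical values for illustration only.
        spec = build_client_spec("guest", "/volumes/group1/subvol1")
        print(session_evict_payload(spec))

Because the filter includes client_metadata.root, only sessions that mounted this particular subvolume are evicted; another client using the same auth ID on a different subvolume keeps its session, which is what test_subvolume_evict_client asserts.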