From: Xiubo Li Date: Tue, 3 Mar 2020 13:31:29 +0000 (-0500) Subject: qa/vstart_runner: add unsharing network namespace support X-Git-Tag: v16.1.0~2551^2~5 X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=f0c67256b9ccffc8071ea346b7620f6188f35e9d;p=ceph.git qa/vstart_runner: add unsharing network namespace support This will isolate the network namespace for each mount point with a private ip address and iptables, etc. For the kill() stuff it will just do DOWN the veth interface instead of sending ipmi request for kernel mount and kill the fuse processes for the fuse mount. This could avoid sending the socket FIN to the ceph cluster. Fixes: https://tracker.ceph.com/issues/44044 Signed-off-by: Xiubo Li --- diff --git a/doc/dev/developer_guide/running-tests-locally.rst b/doc/dev/developer_guide/running-tests-locally.rst index f80a6f52b339..6a1f6e09758a 100644 --- a/doc/dev/developer_guide/running-tests-locally.rst +++ b/doc/dev/developer_guide/running-tests-locally.rst @@ -102,6 +102,8 @@ vstart_runner.py can take the following options - --teardown tears Ceph cluster down after test(s) has finished runnng --kclient use the kernel cephfs client instead of FUSE +--brxnet= specify a new net/mask for the mount clients' network + namespace container (Default: 192.168.0.0/16) .. note:: If using the FUSE client, ensure that the fuse package is installed and enabled on the system and that ``user_allow_other`` is added diff --git a/qa/cephfs/begin.yaml b/qa/cephfs/begin.yaml index 0f3beb8fbd03..b06a4b27a58a 100644 --- a/qa/cephfs/begin.yaml +++ b/qa/cephfs/begin.yaml @@ -13,6 +13,9 @@ tasks: - flex - libelf-dev - libssl-dev + - network-manager + - iproute2 + - util-linux # for xfstests-dev - dump - indent @@ -21,6 +24,9 @@ tasks: - flex - elfutils-libelf-devel - openssl-devel + - NetworkManager + - iproute + - util-linux # for xfstests-dev - libacl-devel - libaio-devel diff --git a/qa/tasks/ceph_fuse.py b/qa/tasks/ceph_fuse.py index 1439ccffd566..34dd36da10a6 100644 --- a/qa/tasks/ceph_fuse.py +++ b/qa/tasks/ceph_fuse.py @@ -42,12 +42,16 @@ def task(ctx, config): this operation on. This lets you e.g. set up one client with ``ceph-fuse`` and another with ``kclient``. + ``brxnet`` should be a Private IPv4 Address range, default range is + [192.168.0.0/16] + Example that mounts all clients:: tasks: - ceph: - ceph-fuse: - interactive: + - brxnet: [192.168.0.0/16] Example that uses both ``kclient` and ``ceph-fuse``:: @@ -106,6 +110,8 @@ def task(ctx, config): mounted_by_me = {} skipped = {} + brxnet = config.get("brxnet", None) + # Construct any new FuseMount instances for id_, remote in clients: client_config = config.get("client.%s" % id_) @@ -120,7 +126,7 @@ def task(ctx, config): continue if id_ not in all_mounts: - fuse_mount = FuseMount(ctx, client_config, testdir, auth_id, remote) + fuse_mount = FuseMount(ctx, client_config, testdir, auth_id, remote, brxnet) all_mounts[id_] = fuse_mount else: # Catch bad configs where someone has e.g. tried to use ceph-fuse and kcephfs for the same client diff --git a/qa/tasks/cephfs/fuse_mount.py b/qa/tasks/cephfs/fuse_mount.py index c23e1f54caf2..ac4da5b360d4 100644 --- a/qa/tasks/cephfs/fuse_mount.py +++ b/qa/tasks/cephfs/fuse_mount.py @@ -2,7 +2,7 @@ from io import BytesIO import json import time import logging - +import re import six from textwrap import dedent @@ -17,8 +17,8 @@ log = logging.getLogger(__name__) class FuseMount(CephFSMount): - def __init__(self, ctx, client_config, test_dir, client_id, client_remote): - super(FuseMount, self).__init__(ctx, test_dir, client_id, client_remote) + def __init__(self, ctx, client_config, test_dir, client_id, client_remote, brxnet): + super(FuseMount, self).__init__(ctx, test_dir, client_id, client_remote, brxnet) self.client_config = client_config if client_config else {} self.fuse_daemon = None @@ -31,6 +31,7 @@ class FuseMount(CephFSMount): if mountpoint is not None: self.mountpoint = mountpoint self.setupfs(name=mount_fs_name) + self.setup_netns() try: return self._mount(mount_path, mount_fs_name, mount_options) @@ -91,6 +92,10 @@ class FuseMount(CephFSMount): ) cwd = None # misc.get_valgrind_args chdir for us + netns_prefix = ['sudo', 'nsenter', + '--net=/var/run/netns/{0}'.format(self.netns_name)] + run_cmd = netns_prefix + run_cmd + run_cmd.extend(fuse_cmd) def list_connections(): @@ -161,6 +166,8 @@ class FuseMount(CephFSMount): self.gather_mount_info() + self.mounted = True + def gather_mount_info(self): status = self.admin_socket(['status']) self.id = status['id'] @@ -247,7 +254,7 @@ class FuseMount(CephFSMount): try: log.info('Running fusermount -u on {name}...'.format(name=self.client_remote.name)) self.client_remote.run( - args=[ + args = [ 'sudo', 'fusermount', '-u', @@ -329,6 +336,7 @@ class FuseMount(CephFSMount): try: # Permit a timeout, so that we do not block forever run.wait([self.fuse_daemon], timeout) + except MaxWhileTries: log.error("process failed to terminate after unmount. This probably" " indicates a bug within ceph-fuse.") @@ -337,50 +345,8 @@ class FuseMount(CephFSMount): if require_clean: raise - self.cleanup() - - def cleanup(self): - """ - Remove the mount point. - - Prerequisite: the client is not mounted. - """ - stderr = BytesIO() - try: - self.client_remote.run( - args=[ - 'rmdir', - '--', - self.mountpoint, - ], - cwd=self.test_dir, - stderr=stderr, - timeout=(60*5), - check_status=False, - ) - except CommandFailedError: - if b"No such file or directory" in stderr.getvalue(): - pass - else: - raise - - def kill(self): - """ - Terminate the client without removing the mount point. - """ - log.info('Killing ceph-fuse connection on {name}...'.format(name=self.client_remote.name)) - self.fuse_daemon.stdin.close() - try: - self.fuse_daemon.wait() - except CommandFailedError: - pass - - def kill_cleanup(self): - """ - Follow up ``kill`` to get to a clean unmounted state. - """ - log.info('Cleaning up killed ceph-fuse connection') - self.umount() + self.cleanup_netns() + self.mounted = False self.cleanup() def teardown(self): @@ -398,6 +364,9 @@ class FuseMount(CephFSMount): except CommandFailedError: pass + self.cleanup_netns() + self.mounted = False + # Indiscriminate, unlike the touchier cleanup() self.client_remote.run( args=[ diff --git a/qa/tasks/cephfs/kernel_mount.py b/qa/tasks/cephfs/kernel_mount.py index 38d44ab04b99..2dc07ca973c3 100644 --- a/qa/tasks/cephfs/kernel_mount.py +++ b/qa/tasks/cephfs/kernel_mount.py @@ -17,19 +17,16 @@ UMOUNT_TIMEOUT = 300 class KernelMount(CephFSMount): - def __init__(self, ctx, test_dir, client_id, client_remote, - ipmi_user, ipmi_password, ipmi_domain): - super(KernelMount, self).__init__(ctx, test_dir, client_id, client_remote) + def __init__(self, ctx, test_dir, client_id, client_remote, brxnet): + super(KernelMount, self).__init__(ctx, test_dir, client_id, client_remote, brxnet) self.mounted = False - self.ipmi_user = ipmi_user - self.ipmi_password = ipmi_password - self.ipmi_domain = ipmi_domain def mount(self, mount_path=None, mount_fs_name=None, mountpoint=None, mount_options=[]): if mountpoint is not None: self.mountpoint = mountpoint self.setupfs(name=mount_fs_name) + self.setup_netns() log.info('Mounting kclient client.{id} at {remote} {mnt}...'.format( id=self.client_id, remote=self.client_remote, mnt=self.mountpoint)) @@ -55,6 +52,8 @@ class KernelMount(CephFSMount): 'adjust-ulimits', 'ceph-coverage', '{tdir}/archive/coverage'.format(tdir=self.test_dir), + 'nsenter', + '--net=/var/run/netns/{0}'.format(self.netns_name), '/bin/mount', '-t', 'ceph', @@ -91,19 +90,9 @@ class KernelMount(CephFSMount): ], timeout=(15*60)) raise e - rproc = self.client_remote.run( - args=[ - 'rmdir', - '--', - self.mountpoint, - ], - wait=False - ) - run.wait([rproc], UMOUNT_TIMEOUT) self.mounted = False - - def cleanup(self): - pass + self.cleanup_netns() + self.cleanup() def umount_wait(self, force=False, require_clean=False, timeout=900): """ @@ -118,10 +107,20 @@ class KernelMount(CephFSMount): if not force: raise - self.kill() - self.kill_cleanup() - - self.mounted = False + # force delete the netns and umount + self.cleanup_netns() + self.client_remote.run( + args=['sudo', + 'umount', + '-f', + '-l', + self.mountpoint + ], + timeout=(15*60)) + + self.mounted = False + self.cleanup_netns() + self.cleanup() def is_mounted(self): return self.mounted @@ -138,57 +137,6 @@ class KernelMount(CephFSMount): if self.mounted: self.umount() - def kill(self): - """ - The Ceph kernel client doesn't have a mechanism to kill itself (doing - that in side the kernel would be weird anyway), so we reboot the whole node - to get the same effect. - - We use IPMI to reboot, because we don't want the client to send any - releases of capabilities. - """ - - con = orchestra_remote.getRemoteConsole(self.client_remote.hostname, - self.ipmi_user, - self.ipmi_password, - self.ipmi_domain) - con.hard_reset(wait_for_login=False) - - self.mounted = False - - def kill_cleanup(self): - assert not self.mounted - - # We need to do a sleep here because we don't know how long it will - # take for a hard_reset to be effected. - time.sleep(30) - - try: - # Wait for node to come back up after reboot - misc.reconnect(None, 300, [self.client_remote]) - except: - # attempt to get some useful debug output: - con = orchestra_remote.getRemoteConsole(self.client_remote.hostname, - self.ipmi_user, - self.ipmi_password, - self.ipmi_domain) - con.check_status(timeout=60) - raise - - # Remove mount directory - self.client_remote.run(args=['uptime'], timeout=10) - - # Remove mount directory - self.client_remote.run( - args=[ - 'rmdir', - '--', - self.mountpoint, - ], - timeout=(5*60), - check_status=False, - ) - def _find_debug_dir(self): """ Find the debugfs folder for this mount diff --git a/qa/tasks/cephfs/mount.py b/qa/tasks/cephfs/mount.py index d3cb0a429edd..d913dcbcb64f 100644 --- a/qa/tasks/cephfs/mount.py +++ b/qa/tasks/cephfs/mount.py @@ -8,15 +8,18 @@ import time from six import StringIO from textwrap import dedent import os +import re +from IPy import IP from teuthology.orchestra import run from teuthology.orchestra.run import CommandFailedError, ConnectionLostError from tasks.cephfs.filesystem import Filesystem +import platform log = logging.getLogger(__name__) class CephFSMount(object): - def __init__(self, ctx, test_dir, client_id, client_remote): + def __init__(self, ctx, test_dir, client_id, client_remote, brxnet): """ :param test_dir: Global teuthology test dir :param client_id: Client ID, the 'foo' in client.foo @@ -30,11 +33,32 @@ class CephFSMount(object): self.mountpoint_dir_name = 'mnt.{id}'.format(id=self.client_id) self._mountpoint = None self.fs = None + self._netns_name = None + self.nsid = -1 + if brxnet is None: + self.ceph_brx_net = '192.168.0.0/16' + else: + self.ceph_brx_net = brxnet self.test_files = ['a', 'b', 'c'] self.background_procs = [] + # On Centos/Redhat 8 the 'brctl' has been deprecated. But the + # 'nmcli' in ubuntu 18.04 is buggy to setup the network bridge + # and there is no workaround, will continue to use the 'brctl' + args = ["bash", "-c", + "cat /etc/os-release"] + p = self.client_remote.run(args=args, stderr=BytesIO(), + stdout=BytesIO(), timeout=(5*60)) + distro = re.findall(r'NAME="Ubuntu"', p.stdout.getvalue()) + version = re.findall(r'VERSION_ID="18.04"', p.stdout.getvalue()) + self.use_brctl = len(distro) is not 0 and len(version) is not 0 + + def _parse_netns_name(self): + self._netns_name = '-'.join(["ceph-ns", + re.sub(r'/+', "-", self.mountpoint)]) + @property def mountpoint(self): if self._mountpoint == None: @@ -47,6 +71,19 @@ class CephFSMount(object): if not isinstance(path, str): raise RuntimeError('path should be of str type.') self._mountpoint = path + self._parse_netns_name() + + @property + def netns_name(self): + if self._netns_name == None: + self._parse_netns_name() + return self._netns_name + + @netns_name.setter + def netns_name(self, name): + if not isinstance(path, str): + raise RuntimeError('path should be of str type.') + self._netns_name = name def is_mounted(self): raise NotImplementedError() @@ -60,6 +97,320 @@ class CephFSMount(object): self.fs.wait_for_daemons() log.info('Ready to start {}...'.format(type(self).__name__)) + def _bringup_network_manager_service(self): + args = ["sudo", "bash", "-c", + "systemctl start NetworkManager"] + self.client_remote.run(args=args, timeout=(5*60)) + + def _setup_brx_and_nat(self): + # The ip for ceph-brx should be + ip = IP(self.ceph_brx_net)[-2] + mask = self.ceph_brx_net.split('/')[1] + brd = IP(self.ceph_brx_net).broadcast() + + brx = self.client_remote.run(args=['ip', 'addr'], stderr=BytesIO(), + stdout=BytesIO(), timeout=(5*60)) + brx = re.findall(r'inet .* ceph-brx', brx.stdout.getvalue()) + if brx: + # If the 'ceph-brx' already exists, then check whether + # the new net is conflicting with it + _ip, _mask = brx[0].split()[1].split('/', 1) + if _ip != "{}".format(ip) or _mask != mask: + raise RuntimeError("Conflict with existing ceph-brx {0}, new {1}/{2}".format(brx[0].split()[1], ip, mask)) + return + + log.info("Setuping the 'ceph-brx' with {0}/{1}".format(ip, mask)) + + # Setup the ceph-brx and always use the last valid IP + if self.use_brctl == True: + args = ["sudo", "bash", "-c", "brctl addbr ceph-brx"] + self.client_remote.run(args=args, timeout=(5*60)) + args = ["sudo", "bash", "-c", "ip link set ceph-brx up"] + self.client_remote.run(args=args, timeout=(5*60)) + args = ["sudo", "bash", "-c", + "ip addr add {0}/{1} brd {2} dev ceph-brx".format(ip, mask, brd)] + self.client_remote.run(args=args, timeout=(5*60)) + else: + self._bringup_network_manager_service() + args = ["sudo", "bash", "-c", + "nmcli connection add type bridge con-name ceph-brx ifname ceph-brx stp no"] + self.client_remote.run(args=args, timeout=(5*60)) + args = ["sudo", "bash", "-c", + "nmcli connection modify ceph-brx ipv4.addresses {0}/{1} ipv4.method manual".format(ip, mask)] + self.client_remote.run(args=args, timeout=(5*60)) + args = ["sudo", "bash", "-c", "nmcli connection up ceph-brx"] + self.client_remote.run(args=args, timeout=(5*60)) + + # Save the ip_forward + self.client_remote.run(args=['touch', '/tmp/python-ceph-brx'], + timeout=(5*60)) + p = self.client_remote.run(args=['cat', '/proc/sys/net/ipv4/ip_forward'], + stderr=BytesIO(), stdout=BytesIO(), + timeout=(5*60)) + val = p.stdout.getvalue().strip() + args = ["sudo", "bash", "-c", + "echo {} > /tmp/python-ceph-brx".format(val)] + self.client_remote.run(args=args, timeout=(5*60)) + args = ["sudo", "bash", "-c", + "echo 1 > /proc/sys/net/ipv4/ip_forward"] + self.client_remote.run(args=args, timeout=(5*60)) + + # Setup the NAT + p = self.client_remote.run(args=['route'], stderr=BytesIO(), + stdout=BytesIO(), timeout=(5*60)) + p = re.findall(r'default .*', p.stdout.getvalue()) + if p == False: + raise RuntimeError("No default gw found") + gw = p[0].split()[7] + args = ["sudo", "bash", "-c", + "iptables -A FORWARD -o {0} -i ceph-brx -j ACCEPT".format(gw)] + self.client_remote.run(args=args, timeout=(5*60)) + args = ["sudo", "bash", "-c", + "iptables -A FORWARD -i {0} -o ceph-brx -j ACCEPT".format(gw)] + self.client_remote.run(args=args, timeout=(5*60)) + args = ["sudo", "bash", "-c", + "iptables -t nat -A POSTROUTING -s {0}/{1} -o {2} -j MASQUERADE".format(ip, mask, gw)] + self.client_remote.run(args=args, timeout=(5*60)) + + def _setup_netns(self): + p = self.client_remote.run(args=['ip', 'netns', 'list'], + stderr=BytesIO(), stdout=BytesIO(), + timeout=(5*60)) + p = p.stdout.getvalue().strip() + if re.match(self.netns_name, p) is not None: + raise RuntimeError("the netns '{}' already exists!".format(self.netns_name)) + + # Get the netns name list + netns_list = re.findall(r'[^()\s][-.\w]+[^():\s]', p) + + # Get an uniq netns id + nsid = 0 + while True: + p = self.client_remote.run(args=['ip', 'netns', 'list-id'], + stderr=BytesIO(), stdout=BytesIO(), + timeout=(5*60)) + p = re.search(r"nsid {} ".format(nsid), p.stdout.getvalue()) + if p is None: + break + + nsid += 1 + + self.nsid = nsid; + + # Add one new netns and set it id + args = ["sudo", "bash", "-c", + "ip netns add {0}".format(self.netns_name)] + self.client_remote.run(args=args, timeout=(5*60)) + args = ["sudo", "bash", "-c", + "ip netns set {0} {1}".format(self.netns_name, nsid)] + self.client_remote.run(args=args, timeout=(5*60)) + + # Get one ip address for netns + ips = IP(self.ceph_brx_net) + for ip in ips: + found = False + if ip == ips[0]: + continue + if ip == ips[-2]: + raise RuntimeError("we have ran out of the ip addresses") + + for ns in netns_list: + ns_name = ns.split()[0] + args = ["sudo", "bash", "-c", + "ip netns exec {0} ip addr".format(ns_name)] + p = self.client_remote.run(args=args, stderr=BytesIO(), + stdout=BytesIO(), timeout=(5*60)) + q = re.search("{0}".format(ip), p.stdout.getvalue()) + if q is not None: + found = True + break + + if found == False: + break + + mask = self.ceph_brx_net.split('/')[1] + brd = IP(self.ceph_brx_net).broadcast() + + log.info("Setuping the netns '{0}' with {1}/{2}".format(self.netns_name, ip, mask)) + + # Setup the veth interfaces + args = ["sudo", "bash", "-c", + "ip link add veth0 netns {0} type veth peer name brx.{1}".format(self.netns_name, nsid)] + self.client_remote.run(args=args, timeout=(5*60)) + args = ["sudo", "bash", "-c", + "ip netns exec {0} ip addr add {1}/{2} brd {3} dev veth0".format(self.netns_name, ip, mask, brd)] + self.client_remote.run(args=args, timeout=(5*60)) + args = ["sudo", "bash", "-c", + "ip netns exec {0} ip link set veth0 up".format(self.netns_name)] + self.client_remote.run(args=args, timeout=(5*60)) + args = ["sudo", "bash", "-c", + "ip netns exec {0} ip link set lo up".format(self.netns_name)] + self.client_remote.run(args=args, timeout=(5*60)) + + brxip = IP(self.ceph_brx_net)[-2] + args = ["sudo", "bash", "-c", + "ip netns exec {0} ip route add default via {1}".format(self.netns_name, brxip)] + self.client_remote.run(args=args, timeout=(5*60)) + + # Bring up the brx interface and join it to 'ceph-brx' + if self.use_brctl == True: + args = ["sudo", "bash", "-c", + "ip link set brx.{0} up".format(nsid)] + self.client_remote.run(args=args, timeout=(5*60)) + args = ["sudo", "bash", "-c", + "brctl addif ceph-brx brx.{0}".format(nsid)] + self.client_remote.run(args=args, timeout=(5*60)) + else: + self._bringup_network_manager_service() + args = ["sudo", "bash", "-c", + "nmcli connection add type bridge-slave con-name brx.{0} ifname brx.{0} master ceph-brx".format(nsid)] + self.client_remote.run(args=args, timeout=(5*60)) + args = ["sudo", "bash", "-c", + "nmcli connection up brx.{0}".format(self.nsid)] + self.client_remote.run(args=args, timeout=(5*60)) + + def _cleanup_netns(self): + if self.nsid == -1: + return + log.info("Removing the netns '{0}'".format(self.netns_name)) + + # Delete the netns and the peer veth interface + if self.use_brctl == True: + args = ["sudo", "bash", "-c", + "ip link set brx.{0} down".format(self.nsid)] + self.client_remote.run(args=args, timeout=(5*60)) + args = ["sudo", "bash", "-c", + "brctl delif ceph-brx brx.{0}".format(self.nsid)] + self.client_remote.run(args=args, timeout=(5*60)) + args = ["sudo", "bash", "-c", + "ip link delete brx.{0}".format(self.nsid)] + self.client_remote.run(args=args, timeout=(5*60)) + else: + self._bringup_network_manager_service() + args = ["sudo", "bash", "-c", + "nmcli connection down brx.{0}".format(self.nsid)] + self.client_remote.run(args=args, timeout=(5*60)) + args = ["sudo", "bash", "-c", + "nmcli connection delete brx.{0}".format(self.nsid)] + self.client_remote.run(args=args, timeout=(5*60)) + + args = ["sudo", "bash", "-c", + "ip netns delete {0}".format(self.netns_name)] + self.client_remote.run(args=args, timeout=(5*60)) + + self.nsid = -1 + + def _cleanup_brx_and_nat(self): + brx = self.client_remote.run(args=['ip', 'addr'], stderr=BytesIO(), + stdout=BytesIO(), timeout=(5*60)) + brx = re.findall(r'inet .* ceph-brx', brx.stdout.getvalue()) + if not brx: + return + + # If we are the last netns, will delete the ceph-brx + if self.use_brctl == True: + p = self.client_remote.run(args=['brctl', 'show', 'ceph-brx'], + stderr=BytesIO(), stdout=BytesIO(), + timeout=(5*60)) + else: + self._bringup_network_manager_service() + p = self.client_remote.run(args=['nmcli', 'connection', 'show'], + stderr=BytesIO(), stdout=BytesIO(), + timeout=(5*60)) + _list = re.findall(r'brx\.', p.stdout.getvalue().strip()) + if len(_list) != 0: + return + + log.info("Removing the 'ceph-brx'") + + if self.use_brctl == True: + args = ["sudo", "bash", "-c", + "ip link set ceph-brx down"] + self.client_remote.run(args=args, timeout=(5*60)) + args = ["sudo", "bash", "-c", + "brctl delbr ceph-brx"] + self.client_remote.run(args=args, timeout=(5*60)) + else: + args = ["sudo", "bash", "-c", + "nmcli connection down ceph-brx"] + self.client_remote.run(args=args, timeout=(5*60)) + args = ["sudo", "bash", "-c", + "nmcli connection delete ceph-brx"] + self.client_remote.run(args=args, timeout=(5*60)) + + # Drop the iptables NAT rules + ip = IP(self.ceph_brx_net)[-2] + mask = self.ceph_brx_net.split('/')[1] + + p = self.client_remote.run(args=['route'], stderr=BytesIO(), + stdout=BytesIO(), timeout=(5*60)) + p = re.findall(r'default .*', p.stdout.getvalue()) + if p == False: + raise RuntimeError("No default gw found") + gw = p[0].split()[7] + args = ["sudo", "bash", "-c", + "iptables -D FORWARD -o {0} -i ceph-brx -j ACCEPT".format(gw)] + self.client_remote.run(args=args, timeout=(5*60)) + args = ["sudo", "bash", "-c", + "iptables -D FORWARD -i {0} -o ceph-brx -j ACCEPT".format(gw)] + self.client_remote.run(args=args, timeout=(5*60)) + args = ["sudo", "bash", "-c", + "iptables -t nat -D POSTROUTING -s {0}/{1} -o {2} -j MASQUERADE".format(ip, mask, gw)] + self.client_remote.run(args=args, timeout=(5*60)) + + # Restore the ip_forward + p = self.client_remote.run(args=['cat', '/tmp/python-ceph-brx'], + stderr=BytesIO(), stdout=BytesIO(), + timeout=(5*60)) + val = p.stdout.getvalue().strip() + args = ["sudo", "bash", "-c", + "echo {} > /proc/sys/net/ipv4/ip_forward".format(val)] + self.client_remote.run(args=args, timeout=(5*60)) + self.client_remote.run(args=['rm', '-f', '/tmp/python-ceph-brx'], + timeout=(5*60)) + + def setup_netns(self): + """ + Setup the netns for the mountpoint. + """ + log.info("Setting the '{0}' netns for '{1}'".format(self._netns_name, self.mountpoint)) + self._setup_brx_and_nat() + self._setup_netns() + + def cleanup_netns(self): + """ + Cleanup the netns for the mountpoint. + """ + log.info("Cleaning the '{0}' netns for '{1}'".format(self._netns_name, self.mountpoint)) + self._cleanup_netns() + self._cleanup_brx_and_nat() + + def suspend_netns(self): + """ + Suspend the netns veth interface. + """ + if self.nsid == -1: + return + + log.info("Suspending the '{0}' netns for '{1}'".format(self._netns_name, self.mountpoint)) + + args = ["sudo", "bash", "-c", + "ip link set brx.{0} down".format(self.nsid)] + self.client_remote.run(args=args, timeout=(5*60)) + + def resume_netns(self): + """ + Resume the netns veth interface. + """ + if self.nsid == -1: + return + + log.info("Resuming the '{0}' netns for '{1}'".format(self._netns_name, self.mountpoint)) + + args = ["sudo", "bash", "-c", + "ip link set brx.{0} up".format(self.nsid)] + self.client_remote.run(args=args, timeout=(5*60)) + def mount(self, mount_path=None, mount_fs_name=None, mountpoint=None, mount_options=[]): raise NotImplementedError() @@ -78,14 +429,46 @@ class CephFSMount(object): """ raise NotImplementedError() - def kill_cleanup(self): - raise NotImplementedError() - def kill(self): - raise NotImplementedError() + """ + Suspend the netns veth interface to make the client disconnected + from the ceph cluster + """ + log.info('Killing connection on {0}...'.format(self.client_remote.name)) + self.suspend_netns() + + def kill_cleanup(self): + """ + Follow up ``kill`` to get to a clean unmounted state. + """ + log.info('Cleaning up killed connection on {0}'.format(self.client_remote.name)) + self.umount_wait(force=True) + self.cleanup() def cleanup(self): - raise NotImplementedError() + """ + Remove the mount point. + + Prerequisite: the client is not mounted. + """ + stderr = BytesIO() + try: + self.client_remote.run( + args=[ + 'rmdir', + '--', + self.mountpoint, + ], + cwd=self.test_dir, + stderr=stderr, + timeout=(60*5), + check_status=False, + ) + except CommandFailedError: + if "No such file or directory" in stderr.getvalue(): + pass + else: + raise def wait_until_mounted(self): raise NotImplementedError() diff --git a/qa/tasks/kclient.py b/qa/tasks/kclient.py index 50d557f3ce75..efc6cb47ff28 100644 --- a/qa/tasks/kclient.py +++ b/qa/tasks/kclient.py @@ -22,12 +22,16 @@ def task(ctx, config): this operation on. This lets you e.g. set up one client with ``ceph-fuse`` and another with ``kclient``. + ``brxnet`` should be a Private IPv4 Address range, default range is + [192.168.0.0/16] + Example that mounts all clients:: tasks: - ceph: - kclient: - interactive: + - brxnet: [192.168.0.0/16] Example that uses both ``kclient` and ``ceph-fuse``:: @@ -86,9 +90,7 @@ def task(ctx, config): test_dir, id_, remote, - ctx.teuthology_config.get('ipmi_user', None), - ctx.teuthology_config.get('ipmi_password', None), - ctx.teuthology_config.get('ipmi_domain', None) + ctx.teuthology_config.get('brxnet', None), ) mounts[id_] = kernel_mount diff --git a/qa/tasks/vstart_runner.py b/qa/tasks/vstart_runner.py index 919ac92c58b4..42e09f188f72 100644 --- a/qa/tasks/vstart_runner.py +++ b/qa/tasks/vstart_runner.py @@ -43,6 +43,7 @@ import os import time import sys import errno +from IPy import IP from unittest import suite, loader import unittest import platform @@ -514,8 +515,8 @@ def safe_kill(pid): class LocalKernelMount(KernelMount): - def __init__(self, ctx, test_dir, client_id): - super(LocalKernelMount, self).__init__(ctx, test_dir, client_id, LocalRemote(), None, None, None) + def __init__(self, ctx, test_dir, client_id, brxnet): + super(LocalKernelMount, self).__init__(ctx, test_dir, client_id, LocalRemote(), brxnet) @property def config_path(self): @@ -656,6 +657,7 @@ class LocalKernelMount(KernelMount): def mount(self, mount_path=None, mount_fs_name=None, mount_options=[]): self.setupfs(name=mount_fs_name) + self.setup_netns() log.info('Mounting kclient client.{id} at {remote} {mnt}...'.format( id=self.client_id, remote=self.client_remote, mnt=self.mountpoint)) @@ -684,6 +686,8 @@ class LocalKernelMount(KernelMount): self.client_remote.run( args=[ 'sudo', + 'nsenter', + '--net=/var/run/netns/{0}'.format(self.netns_name), './bin/mount.ceph', ':{mount_path}'.format(mount_path=mount_path), self.mountpoint, @@ -709,8 +713,8 @@ class LocalKernelMount(KernelMount): wait=False) class LocalFuseMount(FuseMount): - def __init__(self, ctx, test_dir, client_id): - super(LocalFuseMount, self).__init__(ctx, None, test_dir, client_id, LocalRemote()) + def __init__(self, ctx, test_dir, client_id, brxnet): + super(LocalFuseMount, self).__init__(ctx, None, test_dir, client_id, LocalRemote(), brxnet) @property def config_path(self): @@ -823,6 +827,7 @@ class LocalFuseMount(FuseMount): if mountpoint is not None: self.mountpoint = mountpoint self.setupfs(name=mount_fs_name) + self.setup_netns() self.client_remote.run(args=['mkdir', '-p', self.mountpoint]) @@ -863,7 +868,9 @@ class LocalFuseMount(FuseMount): prefix += mount_options; self.fuse_daemon = self.client_remote.run(args= - prefix + [ + ['nsenter', + '--net=/var/run/netns/{0}'.format(self.netns_name), + ] + prefix + [ "-f", "--name", "client.{0}".format(self.client_id), @@ -1201,6 +1208,7 @@ class InteractiveFailureResult(unittest.TextTestResult): def enumerate_methods(s): log.info("e: {0}".format(s)) for t in s._tests: + print("t {0}, s._tests {1}".format(t, s._tests)) if isinstance(t, suite.BaseTestSuite): for sub in enumerate_methods(t): yield sub @@ -1232,7 +1240,10 @@ def scan_tests(modules): max_required_mgr = 0 require_memstore = False + print("module = {}".format(overall_suite)) for suite_, case in enumerate_methods(overall_suite): + print("suite {0}".format(suite_)) + print("case {0}".format(case)) max_required_mds = max(max_required_mds, getattr(case, "MDSS_REQUIRED", 0)) max_required_clients = max(max_required_clients, @@ -1310,6 +1321,7 @@ def exec_test(): global opt_log_ps_output opt_log_ps_output = False use_kernel_client = False + opt_brxnet= None args = sys.argv[1:] flags = [a for a in args if a.startswith("-")] @@ -1331,6 +1343,18 @@ def exec_test(): clear_old_log() elif f == "--kclient": use_kernel_client = True + elif '--brxnet' in f: + if re.search(r'=[0-9./]+', f) is None: + log.error("--brxnet= option needs one argument: '{0}'".format(f)) + sys.exit(-1) + opt_brxnet=f.split('=')[1] + try: + IP(opt_brxnet) + if IP(opt_brxnet).iptype() is 'PUBLIC': + raise RuntimeError('is public') + except Exception as e: + log.error("Invalid ip '{0}' {1}".format(opt_brxnet, e)) + sys.exit(-1) else: log.error("Unknown option '{0}'".format(f)) sys.exit(-1) @@ -1415,9 +1439,9 @@ def exec_test(): open("./keyring", "a").write(p.stdout.getvalue()) if use_kernel_client: - mount = LocalKernelMount(ctx, test_dir, client_id) + mount = LocalKernelMount(ctx, test_dir, client_id, opt_brxnet) else: - mount = LocalFuseMount(ctx, test_dir, client_id) + mount = LocalFuseMount(ctx, test_dir, client_id, opt_brxnet) mounts.append(mount) if os.path.exists(mount.mountpoint):