From a16b0ebc3628ed6be85c7804b7330047eb9cba79 Mon Sep 17 00:00:00 2001 From: Zack Cerza Date: Wed, 31 Jan 2024 17:27:35 -0700 Subject: [PATCH] Remove nuke: deletions This commit contains only full file deletions, and the relocation of nuke.actions.clear_firewall() to nuke/__init__.py to retain compatibility with older ceph.git tasks. Signed-off-by: Zack Cerza --- docs/commands/teuthology-nuke.rst | 4 - roles/3-simple.yaml | 4 - roles/overrides.yaml | 10 - scripts/nuke.py | 47 -- scripts/test/test_nuke.py | 5 - teuthology/nuke/__init__.py | 372 +------------- teuthology/nuke/actions.py | 459 ------------------ .../openstack/test/suites/nuke/nuke.yaml | 8 - teuthology/test/test_nuke.py | 276 ----------- 9 files changed, 12 insertions(+), 1173 deletions(-) delete mode 100644 docs/commands/teuthology-nuke.rst delete mode 100644 roles/3-simple.yaml delete mode 100644 roles/overrides.yaml delete mode 100644 scripts/nuke.py delete mode 100644 scripts/test/test_nuke.py delete mode 100644 teuthology/nuke/actions.py delete mode 100644 teuthology/openstack/test/suites/nuke/nuke.yaml delete mode 100644 teuthology/test/test_nuke.py diff --git a/docs/commands/teuthology-nuke.rst b/docs/commands/teuthology-nuke.rst deleted file mode 100644 index 77ec0b89e..000000000 --- a/docs/commands/teuthology-nuke.rst +++ /dev/null @@ -1,4 +0,0 @@ -teuthology-nuke -=============== - -.. program-output:: teuthology-nuke --help diff --git a/roles/3-simple.yaml b/roles/3-simple.yaml deleted file mode 100644 index ac2b3917a..000000000 --- a/roles/3-simple.yaml +++ /dev/null @@ -1,4 +0,0 @@ -roles: -- [mon.a, mds.a, osd.0] -- [mon.b, mds.a-s, osd.1] -- [mon.c, client.0] diff --git a/roles/overrides.yaml b/roles/overrides.yaml deleted file mode 100644 index e93a2b239..000000000 --- a/roles/overrides.yaml +++ /dev/null @@ -1,10 +0,0 @@ -nuke-on-error: true -kernel: - branch: main -overrides: - ceph: - branch: BRANCH_NAME - log-ignorelist: - - 'clocks not synchronized' -tasks: -- chef: diff --git a/scripts/nuke.py b/scripts/nuke.py deleted file mode 100644 index 0b1644c3e..000000000 --- a/scripts/nuke.py +++ /dev/null @@ -1,47 +0,0 @@ -import docopt - -import teuthology.nuke - -doc = """ -usage: - teuthology-nuke --help - teuthology-nuke [-v] [--owner OWNER] [-n NAME] [-u] [-i] [-r|-R] [-s] [-k] - [-p PID] [--dry-run] (-t CONFIG... | -a DIR) - teuthology-nuke [-v] [-u] [-i] [-r] [-s] [--dry-run] --owner OWNER --stale - teuthology-nuke [-v] [--dry-run] --stale-openstack - -Reset test machines - -optional arguments: - -h, --help show this help message and exit - -v, --verbose be more verbose - -t CONFIG [CONFIG ...], --targets CONFIG [CONFIG ...] - yaml config containing machines to nuke - -a DIR, --archive DIR - archive path for a job to kill and nuke - --stale attempt to find and nuke 'stale' machines - (e.g. locked by jobs that are no longer running) - --stale-openstack nuke 'stale' OpenStack instances and volumes - and unlock OpenStack targets with no instance - --dry-run Don't actually nuke anything; just print the list of - targets that would be nuked - --owner OWNER job owner - -p PID, --pid PID pid of the process to be killed - -r, --reboot-all reboot all machines (default) - -R, --no-reboot do not reboot the machines - -s, --synch-clocks synchronize clocks on all machines - -u, --unlock Unlock each successfully nuked machine, and output - targets thatcould not be nuked. 
- -n NAME, --name NAME Name of run to cleanup - -i, --noipmi Skip ipmi checking - -k, --keep-logs Preserve test directories and logs on the machines - -Examples: -teuthology-nuke -t target.yaml --unlock --owner user@host -teuthology-nuke -t target.yaml --pid 1234 --unlock --owner user@host -""" - - -def main(): - args = docopt.docopt(doc) - teuthology.nuke.main(args) diff --git a/scripts/test/test_nuke.py b/scripts/test/test_nuke.py deleted file mode 100644 index fa615c466..000000000 --- a/scripts/test/test_nuke.py +++ /dev/null @@ -1,5 +0,0 @@ -from script import Script - - -class TestNuke(Script): - script_name = 'teuthology-nuke' diff --git a/teuthology/nuke/__init__.py b/teuthology/nuke/__init__.py index 8a2985b9e..9c6eefe18 100644 --- a/teuthology/nuke/__init__.py +++ b/teuthology/nuke/__init__.py @@ -1,368 +1,20 @@ -import argparse -import datetime -import json import logging -import os -import subprocess - -import yaml - -import teuthology - -from teuthology import provision -from teuthology.lock import ops as lock_ops -from teuthology.lock import util -from teuthology.lock.query import is_vm, list_locks, \ - find_stale_locks, get_status -from teuthology.nuke.actions import ( - check_console, clear_firewall, shutdown_daemons, remove_installed_packages, - reboot, remove_osd_mounts, remove_osd_tmpfs, kill_hadoop, - remove_ceph_packages, synch_clocks, unlock_firmware_repo, - remove_configuration_files, undo_multipath, reset_syslog_dir, - remove_ceph_data, remove_testing_tree, remove_yum_timedhosts, - kill_valgrind, -) -from teuthology.config import config, FakeNamespace -from teuthology.misc import ( - canonicalize_hostname, config_file, decanonicalize_hostname, merge_configs, - get_user, sh -) -from teuthology.openstack import OpenStack, OpenStackInstance, enforce_json_dictionary -from teuthology.orchestra import remote -from teuthology.parallel import parallel -from teuthology.task import internal -from teuthology.task.internal import check_lock log = logging.getLogger(__name__) -def openstack_volume_id(volume): - return (volume.get('ID') or volume['id']) - - -def openstack_volume_name(volume): - return (volume.get('Display Name') or - volume.get('display_name') or - volume.get('Name') or - volume.get('name') or "") - - -def stale_openstack(ctx): - targets = dict(map(lambda i: (i['ID'], i), - OpenStack.list_instances())) - nodes = list_locks(keyed_by_name=True, locked=True) - stale_openstack_instances(ctx, targets, nodes) - stale_openstack_nodes(ctx, targets, nodes) - stale_openstack_volumes(ctx, OpenStack.list_volumes()) - if not ctx.dry_run: - openstack_remove_again() - -# -# A delay, in seconds, that is significantly longer than -# any kind of OpenStack server creation / deletion / etc. 
-# -OPENSTACK_DELAY = 30 * 60 - - -def stale_openstack_instances(ctx, instances, locked_nodes): - for (instance_id, instance) in instances.items(): - i = OpenStackInstance(instance_id) - if not i.exists(): - log.debug("stale-openstack: {instance} disappeared, ignored" - .format(instance=instance_id)) - continue - if (i.get_created() > - config['max_job_time'] + OPENSTACK_DELAY): - log.info( - "stale-openstack: destroying instance {instance}" - " because it was created {created} seconds ago" - " which is older than" - " max_job_time {max_job_time} + {delay}" - .format(instance=i['name'], - created=i.get_created(), - max_job_time=config['max_job_time'], - delay=OPENSTACK_DELAY)) - if not ctx.dry_run: - i.destroy() - continue - name = canonicalize_hostname(i['name'], user=None) - if i.get_created() > OPENSTACK_DELAY and name not in locked_nodes: - log.info("stale-openstack: destroying instance {instance}" - " because it was created {created} seconds ago" - " is older than {delay}s and it is not locked" - .format(instance=i['name'], - created=i.get_created(), - delay=OPENSTACK_DELAY)) - if not ctx.dry_run: - i.destroy() - continue - log.debug("stale-openstack: instance " + i['name'] + " OK") - - -def openstack_delete_volume(id): - OpenStack().run("volume delete " + id + " || true") - - -def stale_openstack_volumes(ctx, volumes): - now = datetime.datetime.now() - for volume in volumes: - volume_id = openstack_volume_id(volume) - try: - volume = json.loads(OpenStack().run("volume show -f json " + - volume_id)) - except subprocess.CalledProcessError: - log.debug("stale-openstack: {id} disappeared, ignored" - .format(id=volume_id)) - continue - volume_name = openstack_volume_name(volume) - enforce_json_dictionary(volume) - created_at = datetime.datetime.strptime( - volume['created_at'], '%Y-%m-%dT%H:%M:%S.%f') - created = (now - created_at).total_seconds() - if created > config['max_job_time'] + OPENSTACK_DELAY: - log.info( - "stale-openstack: destroying volume {volume}({id})" - " because it was created {created} seconds ago" - " which is older than" - " max_job_time {max_job_time} + {delay}" - .format(volume=volume_name, - id=volume_id, - created=created, - max_job_time=config['max_job_time'], - delay=OPENSTACK_DELAY)) - if not ctx.dry_run: - openstack_delete_volume(volume_id) - continue - log.debug("stale-openstack: volume " + volume_id + " OK") - - -def stale_openstack_nodes(ctx, instances, locked_nodes): - names = set([ i['Name'] for i in instances.values() ]) - for (name, node) in locked_nodes.items(): - name = decanonicalize_hostname(name) - if node['machine_type'] != 'openstack': - continue - if (name not in names and - util.locked_since_seconds(node) > OPENSTACK_DELAY): - log.info("stale-openstack: unlocking node {name} unlocked" - " because it was created {created}" - " seconds ago which is older than {delay}" - " and it has no instance" - .format(name=name, - created=util.locked_since_seconds(node), - delay=OPENSTACK_DELAY)) - if not ctx.dry_run: - lock_ops.unlock_one(ctx, name, node['locked_by']) - continue - log.debug("stale-openstack: node " + name + " OK") - - -def openstack_remove_again(): +# This is being kept because ceph.git/qa/tasks/cephfs/filesystem.py references it. +def clear_firewall(ctx): """ - Volumes and servers with REMOVE-ME in the name are leftover - that failed to be removed. It is not uncommon for a failed removal - to succeed later on. + Remove any iptables rules created by teuthology. These rules are + identified by containing a comment with 'teuthology' in it. 
Non-teuthology + firewall rules are unaffected. """ - sh(""" - openstack server list --name REMOVE-ME --column ID --format value | - xargs --no-run-if-empty --max-args 1 -P20 openstack server delete --wait - true - """) - volumes = json.loads(OpenStack().run("volume list -f json --long")) - remove_me = [openstack_volume_id(v) for v in volumes - if 'REMOVE-ME' in openstack_volume_name(v)] - for i in remove_me: - log.info("Trying to remove stale volume %s" % i) - openstack_delete_volume(i) - - -def main(args): - ctx = FakeNamespace(args) - if ctx.verbose: - teuthology.log.setLevel(logging.DEBUG) - - info = {} - if ctx.archive: - ctx.config = config_file(ctx.archive + '/config.yaml') - ifn = os.path.join(ctx.archive, 'info.yaml') - if os.path.exists(ifn): - with open(ifn, 'r') as fd: - info = yaml.safe_load(fd.read()) - if not ctx.pid: - ctx.pid = info.get('pid') - if not ctx.pid: - ctx.pid = int(open(ctx.archive + '/pid').read().rstrip('\n')) - if not ctx.owner: - ctx.owner = info.get('owner') - if not ctx.owner: - ctx.owner = open(ctx.archive + '/owner').read().rstrip('\n') - - if ctx.targets: - ctx.config = merge_configs(ctx.targets) - - if ctx.stale: - stale_nodes = find_stale_locks(ctx.owner) - targets = dict() - for node in stale_nodes: - targets[node['name']] = node['ssh_pub_key'] - ctx.config = dict(targets=targets) - - if ctx.stale_openstack: - stale_openstack(ctx) - return - - log.info( - '\n '.join( - ['targets:', ] + yaml.safe_dump( - ctx.config['targets'], - default_flow_style=False).splitlines())) - - if ctx.dry_run: - log.info("Not actually nuking anything since --dry-run was passed") - return - - if ctx.owner is None: - ctx.owner = get_user() - - if ctx.pid: - if ctx.archive: - log.info('Killing teuthology process at pid %d', ctx.pid) - os.system('grep -q %s /proc/%d/cmdline && sudo kill -9 %d' % ( - ctx.archive, - ctx.pid, - ctx.pid)) - else: - subprocess.check_call(["kill", "-9", str(ctx.pid)]) - - nuke(ctx, ctx.unlock, ctx.synch_clocks, ctx.noipmi, ctx.keep_logs, not ctx.no_reboot) - - -def nuke(ctx, should_unlock, sync_clocks=True, noipmi=False, keep_logs=False, should_reboot=True): - if 'targets' not in ctx.config: - return - total_unnuked = {} - log.info('Checking targets against current locks') - with parallel() as p: - for target, hostkey in ctx.config['targets'].items(): - status = get_status(target) - if ctx.name and ctx.name not in (status.get('description') or ""): - total_unnuked[target] = hostkey - log.info( - f"Not nuking {target} because description doesn't match: " - f"{ctx.name} != {status.get('description')}" - ) - continue - elif status.get('up') is False: - total_unnuked[target] = hostkey - log.info(f"Not nuking {target} because it is down") - continue - p.spawn( - nuke_one, - ctx, - {target: hostkey}, - should_unlock, - sync_clocks, - ctx.config.get('check-locks', True), - noipmi, - keep_logs, - should_reboot, - ) - for unnuked in p: - if unnuked: - total_unnuked.update(unnuked) - if total_unnuked: - log.error('Could not nuke the following targets:\n' + - '\n '.join(['targets:', ] + - yaml.safe_dump( - total_unnuked, - default_flow_style=False).splitlines())) - - -def nuke_one(ctx, target, should_unlock, synch_clocks, - check_locks, noipmi, keep_logs, should_reboot): - ret = None - ctx = argparse.Namespace( - config=dict(targets=target), - owner=ctx.owner, - check_locks=check_locks, - synch_clocks=synch_clocks, - teuthology_config=config.to_dict(), - name=ctx.name, - noipmi=noipmi, + log.info("Clearing teuthology firewall rules...") + ctx.cluster.run( + 
args=[ + "sudo", "sh", "-c", + "iptables-save | grep -v teuthology | iptables-restore" + ], ) - try: - nuke_helper(ctx, should_unlock, keep_logs, should_reboot) - except Exception: - log.exception('Could not nuke %s' % target) - # not re-raising the so that parallel calls aren't killed - ret = target - else: - if should_unlock: - lock_ops.unlock_one(ctx, list(target.keys())[0], ctx.owner) - return ret - - -def nuke_helper(ctx, should_unlock, keep_logs, should_reboot): - # ensure node is up with ipmi - (target,) = ctx.config['targets'].keys() - host = target.split('@')[-1] - shortname = host.split('.')[0] - if should_unlock: - if is_vm(shortname): - return - log.debug('shortname: %s' % shortname) - remote_ = remote.Remote(host) - if ctx.check_locks: - # does not check to ensure if the node is 'up' - # we want to be able to nuke a downed node - check_lock.check_lock(ctx, None, check_up=False) - status = get_status(host) - if status['machine_type'] in provision.fog.get_types(): - remote_.console.power_off() - return - elif status['machine_type'] in provision.pelagos.get_types(): - provision.pelagos.park_node(host) - return - elif remote_.is_container: - remote_.run( - args=['sudo', '/testnode_stop.sh'], - check_status=False, - ) - return - if (not ctx.noipmi and 'ipmi_user' in config and - 'vpm' not in shortname): - try: - check_console(host) - except Exception: - log.exception('') - log.info("Will attempt to connect via SSH") - remote_ = remote.Remote(host) - remote_.connect() - internal.add_remotes(ctx, None) - internal.connect(ctx, None) - clear_firewall(ctx) - shutdown_daemons(ctx) - kill_valgrind(ctx) - # Try to remove packages before reboot - remove_installed_packages(ctx) - remotes = ctx.cluster.remotes.keys() - if should_reboot: - reboot(ctx, remotes) - # shutdown daemons again incase of startup - shutdown_daemons(ctx) - remove_osd_mounts(ctx) - remove_osd_tmpfs(ctx) - kill_hadoop(ctx) - remove_ceph_packages(ctx) - synch_clocks(remotes) - unlock_firmware_repo(ctx) - remove_configuration_files(ctx) - undo_multipath(ctx) - reset_syslog_dir(ctx) - remove_ceph_data(ctx) - if not keep_logs: - remove_testing_tree(ctx) - remove_yum_timedhosts(ctx) - # Once again remove packages after reboot - remove_installed_packages(ctx) - log.info('Installed packages removed.') + log.info("Cleared teuthology firewall rules.") diff --git a/teuthology/nuke/actions.py b/teuthology/nuke/actions.py deleted file mode 100644 index 621b088c9..000000000 --- a/teuthology/nuke/actions.py +++ /dev/null @@ -1,459 +0,0 @@ -import logging -import time - -from teuthology.misc import get_testdir, reconnect -from teuthology.orchestra import remote as remote_mod, run -from teuthology.task import install as install_task - - -log = logging.getLogger(__name__) - - -def clear_firewall(ctx): - """ - Remove any iptables rules created by teuthology. These rules are - identified by containing a comment with 'teuthology' in it. Non-teuthology - firewall rules are unaffected. 
- """ - log.info("Clearing teuthology firewall rules...") - ctx.cluster.run( - args=[ - "sudo", "sh", "-c", - "iptables-save | grep -v teuthology | iptables-restore" - ], - ) - log.info("Cleared teuthology firewall rules.") - - -def shutdown_daemons(ctx): - log.info('Unmounting ceph-fuse and killing daemons...') - ctx.cluster.run(args=['sudo', 'stop', 'ceph-all', run.Raw('||'), - 'sudo', 'service', 'ceph', 'stop', run.Raw('||'), - 'sudo', 'systemctl', 'stop', 'ceph.target'], - check_status=False, timeout=180) - ctx.cluster.run( - args=[ - 'if', 'grep', '-q', 'ceph-fuse', '/etc/mtab', run.Raw(';'), - 'then', - 'grep', 'ceph-fuse', '/etc/mtab', run.Raw('|'), - 'grep', '-o', " /.* fuse", run.Raw('|'), - 'grep', '-o', "/.* ", run.Raw('|'), - 'xargs', '-n', '1', 'sudo', 'fusermount', '-u', run.Raw(';'), - 'fi', - run.Raw(';'), - 'if', 'grep', '-q', 'rbd-fuse', '/etc/mtab', run.Raw(';'), - 'then', - 'grep', 'rbd-fuse', '/etc/mtab', run.Raw('|'), - 'grep', '-o', " /.* fuse", run.Raw('|'), - 'grep', '-o', "/.* ", run.Raw('|'), - 'xargs', '-n', '1', 'sudo', 'fusermount', '-u', run.Raw(';'), - 'fi', - run.Raw(';'), - 'sudo', - 'killall', - '--quiet', - 'ceph-mon', - 'ceph-osd', - 'ceph-mds', - 'ceph-mgr', - 'ceph-fuse', - 'ceph-disk', - 'radosgw', - 'ceph_test_rados', - 'rados', - 'rbd-fuse', - 'apache2', - run.Raw('||'), - 'true', # ignore errors from ceph binaries not being found - ], - timeout=120, - ) - log.info('All daemons killed.') - - -def kill_hadoop(ctx): - log.info("Terminating Hadoop services...") - ctx.cluster.run(args=[ - "pkill", "-f", "-KILL", "java.*hadoop", - ], - check_status=False, - timeout=60 - ) - - -def kill_valgrind(ctx): - # http://tracker.ceph.com/issues/17084 - ctx.cluster.run( - args=['sudo', 'pkill', '-f', '-9', 'valgrind.bin'], - check_status=False, - timeout=20, - ) - - -def remove_osd_mounts(ctx): - """ - unmount any osd data mounts (scratch disks) - """ - log.info('Unmount any osd data directories...') - ctx.cluster.run( - args=[ - 'grep', - '/var/lib/ceph/osd/', - '/etc/mtab', - run.Raw('|'), - 'awk', '{print $2}', run.Raw('|'), - 'xargs', '-r', - 'sudo', 'umount', '-l', run.Raw(';'), - 'true' - ], - timeout=120 - ) - - -def remove_osd_tmpfs(ctx): - """ - unmount tmpfs mounts - """ - log.info('Unmount any osd tmpfs dirs...') - ctx.cluster.run( - args=[ - 'egrep', r'tmpfs\s+/mnt', '/etc/mtab', run.Raw('|'), - 'awk', '{print $2}', run.Raw('|'), - 'xargs', '-r', - 'sudo', 'umount', run.Raw(';'), - 'true' - ], - timeout=120 - ) - - -def stale_kernel_mount(remote): - proc = remote.run( - args=[ - 'sudo', 'find', - '/sys/kernel/debug/ceph', - '-mindepth', '1', - run.Raw('!'), - '-path', '/sys/kernel/debug/ceph/meta', - run.Raw('!'), - '-path', '/sys/kernel/debug/ceph/meta/client_features', - '-type', 'd', - run.Raw('|'), - 'read' - ], - check_status=False - ) - return proc.exitstatus == 0 - - -def reboot(ctx, remotes): - for remote in remotes: - if stale_kernel_mount(remote): - log.warning('Stale kernel mount on %s!', remote.name) - log.info('force/no-sync rebooting %s', remote.name) - # -n is ignored in systemd versions through v229, which means this - # only works on trusty -- on 7.3 (v219) and xenial (v229) reboot -n - # still calls sync(). 
- # args = ['sync', run.Raw('&'), - # 'sleep', '5', run.Raw(';'), - # 'sudo', 'reboot', '-f', '-n'] - args = ['for', 'sysrq', 'in', 's', 'u', 'b', run.Raw(';'), - 'do', 'echo', run.Raw('$sysrq'), run.Raw('|'), - 'sudo', 'tee', '/proc/sysrq-trigger', run.Raw(';'), - 'done'] - else: - log.info('rebooting %s', remote.name) - args = ['sudo', 'reboot'] - try: - remote.run(args=args, wait=False) - except Exception: - log.exception('ignoring exception during reboot command') - # we just ignore these procs because reboot -f doesn't actually - # send anything back to the ssh client! - if remotes: - log.info('waiting for nodes to reboot') - time.sleep(8) # if we try and reconnect too quickly, it succeeds! - reconnect(ctx, 480) # allow 8 minutes for the reboots - - -def reset_syslog_dir(ctx): - log.info('Resetting syslog output locations...') - nodes = {} - for remote in ctx.cluster.remotes.keys(): - proc = remote.run( - args=[ - 'if', 'test', '-e', '/etc/rsyslog.d/80-cephtest.conf', - run.Raw(';'), - 'then', - 'sudo', 'rm', '-f', '--', '/etc/rsyslog.d/80-cephtest.conf', - run.Raw('&&'), - 'sudo', 'service', 'rsyslog', 'restart', - run.Raw(';'), - 'fi', - run.Raw(';'), - ], - timeout=60, - ) - nodes[remote.name] = proc - - for name, proc in nodes.items(): - log.info('Waiting for %s to restart syslog...', name) - proc.wait() - - -def dpkg_configure(ctx): - for remote in ctx.cluster.remotes.keys(): - if remote.os.package_type != 'deb': - continue - log.info( - 'Waiting for dpkg --configure -a and apt-get -f install...') - remote.run( - args=[ - 'sudo', 'dpkg', '--configure', '-a', - run.Raw(';'), - 'sudo', 'DEBIAN_FRONTEND=noninteractive', - 'apt-get', '-y', '--force-yes', '-f', 'install', - run.Raw('||'), - ':', - ], - timeout=180, - check_status=False, - ) - - -def remove_yum_timedhosts(ctx): - # Workaround for https://bugzilla.redhat.com/show_bug.cgi?id=1233329 - log.info("Removing yum timedhosts files...") - for remote in ctx.cluster.remotes.keys(): - if remote.os.package_type != 'rpm': - continue - remote.run( - args=r"sudo find /var/cache/yum -name 'timedhosts' -exec rm {} \;", - check_status=False, timeout=180 - ) - - -def remove_ceph_packages(ctx): - """ - remove ceph and ceph dependent packages by force - force is needed since the node's repo might have changed and - in many cases autocorrect will not work due to missing packages - due to repo changes - """ - log.info("Force remove ceph packages") - ceph_packages_to_remove = ['ceph-common', 'ceph-mon', 'ceph-osd', - 'libcephfs1', 'libcephfs2', - 'librados2', 'librgw2', 'librbd1', 'python-rgw', - 'ceph-selinux', 'python-cephfs', 'ceph-base', - 'python-rbd', 'python-rados', 'ceph-mds', - 'ceph-mgr', 'libcephfs-java', 'libcephfs-jni', - 'ceph-deploy', 'libapache2-mod-fastcgi' - ] - pkgs = str.join(' ', ceph_packages_to_remove) - for remote in ctx.cluster.remotes.keys(): - if remote.os.package_type == 'rpm': - log.info("Remove any broken repos") - dist_release = remote.os.name - remote.run( - args=['sudo', 'rm', run.Raw("/etc/yum.repos.d/*ceph*")], - check_status=False - ) - remote.run( - args=['sudo', 'rm', run.Raw("/etc/yum.repos.d/*fcgi*")], - check_status=False, - ) - remote.run( - args=['sudo', 'rm', run.Raw("/etc/yum.repos.d/*samba*")], - check_status=False, - ) - remote.run( - args=['sudo', 'rm', run.Raw("/etc/yum.repos.d/*nfs-ganesha*")], - check_status=False, - ) - remote.run( - args=['sudo', 'rpm', '--rebuilddb'] - ) - if dist_release in ['opensuse', 'sle']: - remote.sh('sudo zypper clean') - log.info('Remove any ceph packages') - 
remote.sh('sudo zypper remove --non-interactive', - check_status=False - ) - else: - remote.sh('sudo yum clean all') - log.info('Remove any ceph packages') - remote.sh('sudo yum remove -y', check_status=False) - else: - log.info("Remove any broken repos") - remote.run( - args=['sudo', 'rm', run.Raw("/etc/apt/sources.list.d/*ceph*")], - check_status=False, - ) - remote.run( - args=['sudo', 'rm', run.Raw("/etc/apt/sources.list.d/*samba*")], - check_status=False, - ) - remote.run( - args=['sudo', 'rm', run.Raw("/etc/apt/sources.list.d/*nfs-ganesha*")], - check_status=False, - ) - log.info("Autoclean") - remote.run( - args=['sudo', 'apt-get', 'autoclean'], - check_status=False, - ) - log.info('Remove any ceph packages') - remote.run( - args=[ - 'sudo', 'dpkg', '--remove', '--force-remove-reinstreq', - run.Raw(pkgs) - ], - check_status=False - ) - log.info("Autoclean") - remote.run( - args=['sudo', 'apt-get', 'autoclean'] - ) - - -def remove_installed_packages(ctx): - dpkg_configure(ctx) - conf = dict( - project='ceph', - debuginfo='true', - ) - packages = install_task.get_package_list(ctx, conf) - debs = packages['deb'] + \ - ['salt-common', 'salt-minion', 'calamari-server', - 'python-rados', 'multipath-tools'] - rpms = packages['rpm'] + \ - ['salt-common', 'salt-minion', 'calamari-server', - 'multipath-tools', 'device-mapper-multipath'] - install_task.remove_packages( - ctx, - conf, - dict( - deb=debs, - rpm=rpms, - ) - ) - install_task.remove_sources(ctx, conf) - - -def remove_ceph_data(ctx): - log.info("Removing any stale ceph data...") - ctx.cluster.run( - args=[ - 'sudo', 'rm', '-rf', '/etc/ceph', - run.Raw('/var/run/ceph*'), - ], - ) - - -def remove_testing_tree(ctx): - log.info('Clearing filesystem of test data...') - ctx.cluster.run( - args=[ - 'sudo', 'rm', '-rf', get_testdir(ctx), - # just for old time's sake - run.Raw('&&'), - 'sudo', 'rm', '-rf', '/tmp/cephtest', - run.Raw('&&'), - 'sudo', 'rm', '-rf', '/home/ubuntu/cephtest', - ], - ) - - -def remove_configuration_files(ctx): - """ - Goes through a list of commonly used configuration files used for testing - that should not be left behind. - - For example, sometimes ceph-deploy may be configured via - ``~/.cephdeploy.conf`` to alter how it handles installation by specifying - a default section in its config with custom locations. - """ - ctx.cluster.run( - args=[ - 'rm', '-f', '/home/ubuntu/.cephdeploy.conf' - ], - timeout=30 - ) - - -def undo_multipath(ctx): - """ - Undo any multipath device mappings created, an - remove the packages/daemon that manages them so they don't - come back unless specifically requested by the test. 
- """ - log.info('Removing any multipath config/pkgs...') - for remote in ctx.cluster.remotes.keys(): - remote.run( - args=[ - 'sudo', 'multipath', '-F', - ], - check_status=False, - timeout=60 - ) - - -def synch_clocks(remotes): - log.info('Synchronizing clocks...') - for remote in remotes: - remote.run( - args=[ - 'sudo', 'systemctl', 'stop', 'ntp.service', run.Raw('||'), - 'sudo', 'systemctl', 'stop', 'ntpd.service', run.Raw('||'), - 'sudo', 'systemctl', 'stop', 'chronyd.service', - run.Raw('&&'), - 'sudo', 'ntpdate-debian', run.Raw('||'), - 'sudo', 'ntp', '-gq', run.Raw('||'), - 'sudo', 'ntpd', '-gq', run.Raw('||'), - 'sudo', 'chronyc', 'sources', - run.Raw('&&'), - 'sudo', 'hwclock', '--systohc', '--utc', - run.Raw('&&'), - 'sudo', 'systemctl', 'start', 'ntp.service', run.Raw('||'), - 'sudo', 'systemctl', 'start', 'ntpd.service', run.Raw('||'), - 'sudo', 'systemctl', 'start', 'chronyd.service', - run.Raw('||'), - 'true', # ignore errors; we may be racing with ntpd startup - ], - timeout=60, - ) - - -def unlock_firmware_repo(ctx): - log.info('Making sure firmware.git is not locked...') - ctx.cluster.run(args=['sudo', 'rm', '-f', - '/lib/firmware/updates/.git/index.lock', ]) - - -def check_console(hostname): - remote = remote_mod.Remote(hostname) - shortname = remote.shortname - console = remote.console - if not console: - return - cname = '{host}.{domain}'.format( - host=shortname, - domain=console.ipmidomain, - ) - log.info('checking console status of %s' % cname) - if console.check_status(): - log.info('console ready on %s' % cname) - return - if console.check_power('on'): - log.info('attempting to reboot %s' % cname) - console.power_cycle() - else: - log.info('attempting to power on %s' % cname) - console.power_on() - timeout = 100 - log.info('checking console status of %s with timeout %s' % - (cname, timeout)) - if console.check_status(timeout=timeout): - log.info('console ready on %s' % cname) - else: - log.error("Failed to get console status for %s, " % cname) diff --git a/teuthology/openstack/test/suites/nuke/nuke.yaml b/teuthology/openstack/test/suites/nuke/nuke.yaml deleted file mode 100644 index 9ffd7ac5c..000000000 --- a/teuthology/openstack/test/suites/nuke/nuke.yaml +++ /dev/null @@ -1,8 +0,0 @@ -stop_worker: true -nuke-on-error: true -roles: -- - client.0 -tasks: -- exec: - client.0: - - exit 1 diff --git a/teuthology/test/test_nuke.py b/teuthology/test/test_nuke.py deleted file mode 100644 index b061d89b4..000000000 --- a/teuthology/test/test_nuke.py +++ /dev/null @@ -1,276 +0,0 @@ -import datetime -import json -import os -import pytest -import subprocess - -from unittest.mock import patch, Mock, DEFAULT, ANY - -from teuthology import nuke -from teuthology import misc -from teuthology.config import config -from teuthology.dispatcher.supervisor import create_fake_context - -class TestNuke(object): - - #@pytest.mark.skipif('OS_AUTH_URL' not in os.environ, - # reason="no OS_AUTH_URL environment variable") - def test_stale_openstack_volumes(self): - ctx = Mock() - ctx.teuthology_config = config - ctx.dry_run = False - now = datetime.datetime.strftime(datetime.datetime.now(), - "%Y-%m-%dT%H:%M:%S.000000") - id = '4bee3af9-febb-40c1-a17e-ff63edb415c5' - name = 'target1-0' - volume_list = json.loads( - '[{' - ' "ID": "' + id + '"' - '}]' - ) - # - # A volume created a second ago is left untouched - # - volume_show = ( - '{"id": "' + id + '", ' - '"created_at": "' + now + '", ' - '"display_name": "' + name + '"}' - ) - - with patch('teuthology.nuke.openstack_delete_volume') as 
m_os_del_vol: - with patch.object(nuke.OpenStack, 'run') as m_os_run: - m_os_run.return_value = volume_show - nuke.stale_openstack_volumes(ctx, volume_list) - m_os_del_vol.assert_not_called() - - - # - # A volume created long ago is destroyed - # - ancient = "2000-11-02T15:43:12.000000" - volume_show = ( - '{"id": "' + id + '", ' - '"created_at": "' + ancient + '", ' - '"display_name": "' + name + '"}' - ) - - with patch('teuthology.nuke.openstack_delete_volume') as m_os_del_vol: - with patch.object(nuke.OpenStack, 'run') as m_os_run: - m_os_run.return_value = volume_show - nuke.stale_openstack_volumes(ctx, volume_list) - m_os_del_vol.assert_called_with(id) - - # - # A volume that no longer exists is ignored - # - with patch('teuthology.nuke.openstack_delete_volume') as m_os_del_vol: - with patch.object(nuke.OpenStack, 'run') as m_os_run: - m_os_run.side_effect = subprocess.CalledProcessError('ERROR', 'FAIL') - nuke.stale_openstack_volumes(ctx, volume_list) - m_os_del_vol.assert_not_called() - - def test_stale_openstack_nodes(self): - ctx = Mock() - ctx.teuthology_config = config - ctx.dry_run = False - name = 'target1' - uuid = 'UUID1' - now = datetime.datetime.strftime(datetime.datetime.now(), - "%Y-%m-%d %H:%M:%S.%f") - # - # A node is not of type openstack is left untouched - # - with patch("teuthology.lock.ops.unlock_one") as m_unlock_one: - nuke.stale_openstack_nodes( - ctx, - {}, - {name: {'locked_since': now, 'machine_type': 'mira'}}, - ) - m_unlock_one.assert_not_called() - # - # A node that was just locked and does not have - # an instance yet is left untouched - # - with patch("teuthology.lock.ops.unlock_one") as m_unlock_one: - nuke.stale_openstack_nodes( - ctx, - {}, - {name: {'locked_since': now, 'machine_type': 'openstack'}}, - ) - m_unlock_one.assert_not_called() - # - # A node that has been locked for some time and - # has no instance is unlocked. - # - ancient = "2000-11-02 15:43:12.000000" - me = 'loic@dachary.org' - with patch("teuthology.lock.ops.unlock_one") as m_unlock_one: - nuke.stale_openstack_nodes( - ctx, - {}, - {name: {'locked_since': ancient, 'locked_by': me, 'machine_type': 'openstack'}}, - ) - m_unlock_one.assert_called_with(ctx, name, me) - # - # A node that has been locked for some time and - # has an instance is left untouched - # - with patch("teuthology.lock.ops.unlock_one") as m_unlock_one: - nuke.stale_openstack_nodes( - ctx, - {uuid: {'ID': uuid, 'Name': name}}, - {name: {'locked_since': ancient, 'machine_type': 'openstack'}}, - ) - m_unlock_one.assert_not_called() - - def test_stale_openstack_instances(self): - if 'OS_AUTH_URL' not in os.environ: - pytest.skip('no OS_AUTH_URL environment variable') - ctx = Mock() - ctx.teuthology_config = config - ctx.dry_run = False - name = 'target1' - uuid = 'UUID1' - # - # An instance created a second ago is left untouched, - # even when it is not locked. 
- # - with patch.multiple( - nuke.OpenStackInstance, - exists=lambda _: True, - get_created=lambda _: 1, - __getitem__=lambda _, key: name, - destroy=DEFAULT, - ) as m: - nuke.stale_openstack_instances(ctx, { - uuid: { 'Name': name, }, - }, { - }) - m['destroy'].assert_not_called() - # - # An instance created a very long time ago is destroyed - # - with patch.multiple( - nuke.OpenStackInstance, - exists=lambda _: True, - get_created=lambda _: 1000000000, - __getitem__=lambda _, key: name, - destroy=DEFAULT, - ) as m: - nuke.stale_openstack_instances(ctx, { - uuid: { 'Name': name, }, - }, { - misc.canonicalize_hostname(name, user=None): {}, - }) - m['destroy'].assert_called_with() - # - # An instance that turns out to not exist any longer - # is ignored. - # - with patch.multiple( - nuke.OpenStackInstance, - exists=lambda _: False, - __getitem__=lambda _, key: name, - destroy=DEFAULT, - ) as m: - nuke.stale_openstack_instances(ctx, { - uuid: { 'Name': name, }, - }, { - misc.canonicalize_hostname(name, user=None): {}, - }) - m['destroy'].assert_not_called() - # - # An instance created but not locked after a while is - # destroyed. - # - with patch.multiple( - nuke.OpenStackInstance, - exists=lambda _: True, - get_created=lambda _: nuke.OPENSTACK_DELAY + 1, - __getitem__=lambda _, key: name, - destroy=DEFAULT, - ) as m: - nuke.stale_openstack_instances(ctx, { - uuid: { 'Name': name, }, - }, { - }) - m['destroy'].assert_called_with() - # - # An instance created within the expected lifetime - # of a job and locked is left untouched. - # - with patch.multiple( - nuke.OpenStackInstance, - exists=lambda _: True, - get_created=lambda _: nuke.OPENSTACK_DELAY + 1, - __getitem__=lambda _, key: name, - destroy=DEFAULT, - ) as m: - nuke.stale_openstack_instances(ctx, { - uuid: { 'Name': name, }, - }, { - misc.canonicalize_hostname(name, user=None): {}, - }) - m['destroy'].assert_not_called() - - -@patch("teuthology.lock.ops.unlock_one") -def test_nuke_internal(m_unlock_one): - job_config = dict( - owner='test_owner', - targets={'user@host1': 'key1', 'user@host2': 'key2'}, - archive_path='/path/to/test_run', - machine_type='test_machine', - os_type='centos', - os_version='8.3', - name='test_name', - ) - statuses = { - target: {'name': target, 'description': job_config['name']} - for target in job_config['targets'].keys() - } - ctx = create_fake_context(job_config) - - # minimal call using defaults - with patch.multiple( - nuke, - nuke_helper=DEFAULT, - get_status=lambda i: statuses[i], - ) as m: - nuke.nuke(ctx, True) - m['nuke_helper'].assert_called_with(ANY, True, False, True) - m_unlock_one.assert_called() - m_unlock_one.reset_mock() - - # don't unlock - with patch.multiple( - nuke, - nuke_helper=DEFAULT, - get_status=lambda i: statuses[i], - ) as m: - nuke.nuke(ctx, False) - m['nuke_helper'].assert_called_with(ANY, False, False, True) - m_unlock_one.assert_not_called() - m_unlock_one.reset_mock() - - # mimicing what teuthology-dispatcher --supervisor does - with patch.multiple( - nuke, - nuke_helper=DEFAULT, - get_status=lambda i: statuses[i], - ) as m: - nuke.nuke(ctx, False, True, False, True, False) - m['nuke_helper'].assert_called_with(ANY, False, True, False) - m_unlock_one.assert_not_called() - m_unlock_one.reset_mock() - - # no targets - del ctx.config['targets'] - with patch.multiple( - nuke, - nuke_helper=DEFAULT, - get_status=lambda i: statuses[i], - ) as m: - nuke.nuke(ctx, True) - m['nuke_helper'].assert_not_called() - m_unlock_one.assert_not_called() -- 2.47.3
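
Editor's note (not part of the patch): the commit message states that nuke.actions.clear_firewall()
is relocated to teuthology/nuke/__init__.py so that older ceph.git tasks keep working. Below is a
minimal, illustrative sketch of the compatibility path this preserves. It is not copied from
ceph.git/qa/tasks/cephfs/filesystem.py; "ctx" here stands for any teuthology context object whose
ctx.cluster.run() can execute commands on the test nodes, and the teardown wrapper is a hypothetical
caller added only for illustration.

    # Sketch only: older callers can still import the helper from teuthology.nuke,
    # since this commit defines clear_firewall() directly in teuthology/nuke/__init__.py.
    from teuthology.nuke import clear_firewall

    def teardown_test_firewall(ctx):
        # Per the relocated helper's docstring, this removes only iptables rules
        # whose comment contains 'teuthology'; unrelated firewall rules on the
        # nodes are left untouched.
        clear_firewall(ctx)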