+++ /dev/null
-teuthology-nuke
-===============
-
-.. program-output:: teuthology-nuke --help
+++ /dev/null
-roles:
-- [mon.a, mds.a, osd.0]
-- [mon.b, mds.a-s, osd.1]
-- [mon.c, client.0]
+++ /dev/null
-nuke-on-error: true
-kernel:
- branch: main
-overrides:
- ceph:
- branch: BRANCH_NAME
- log-ignorelist:
- - 'clocks not synchronized'
-tasks:
-- chef:
+++ /dev/null
-import docopt
-
-import teuthology.nuke
-
-doc = """
-usage:
- teuthology-nuke --help
- teuthology-nuke [-v] [--owner OWNER] [-n NAME] [-u] [-i] [-r|-R] [-s] [-k]
- [-p PID] [--dry-run] (-t CONFIG... | -a DIR)
- teuthology-nuke [-v] [-u] [-i] [-r] [-s] [--dry-run] --owner OWNER --stale
- teuthology-nuke [-v] [--dry-run] --stale-openstack
-
-Reset test machines
-
-optional arguments:
- -h, --help show this help message and exit
- -v, --verbose be more verbose
- -t CONFIG [CONFIG ...], --targets CONFIG [CONFIG ...]
- yaml config containing machines to nuke
- -a DIR, --archive DIR
- archive path for a job to kill and nuke
- --stale attempt to find and nuke 'stale' machines
- (e.g. locked by jobs that are no longer running)
- --stale-openstack nuke 'stale' OpenStack instances and volumes
- and unlock OpenStack targets with no instance
- --dry-run Don't actually nuke anything; just print the list of
- targets that would be nuked
- --owner OWNER job owner
- -p PID, --pid PID pid of the process to be killed
- -r, --reboot-all reboot all machines (default)
- -R, --no-reboot do not reboot the machines
- -s, --synch-clocks synchronize clocks on all machines
- -u, --unlock Unlock each successfully nuked machine, and output
- targets that could not be nuked.
- -n NAME, --name NAME Name of run to cleanup
- -i, --noipmi Skip ipmi checking
- -k, --keep-logs Preserve test directories and logs on the machines
-
-Examples:
-teuthology-nuke -t target.yaml --unlock --owner user@host
-teuthology-nuke -t target.yaml --pid 1234 --unlock --owner user@host
-"""
-
-
-def main():
- args = docopt.docopt(doc)
- teuthology.nuke.main(args)
+++ /dev/null
-from script import Script
-
-
-class TestNuke(Script):
- script_name = 'teuthology-nuke'
-import argparse
-import datetime
-import json
import logging
-import os
-import subprocess
-
-import yaml
-
-import teuthology
-
-from teuthology import provision
-from teuthology.lock import ops as lock_ops
-from teuthology.lock import util
-from teuthology.lock.query import is_vm, list_locks, \
- find_stale_locks, get_status
-from teuthology.nuke.actions import (
- check_console, clear_firewall, shutdown_daemons, remove_installed_packages,
- reboot, remove_osd_mounts, remove_osd_tmpfs, kill_hadoop,
- remove_ceph_packages, synch_clocks, unlock_firmware_repo,
- remove_configuration_files, undo_multipath, reset_syslog_dir,
- remove_ceph_data, remove_testing_tree, remove_yum_timedhosts,
- kill_valgrind,
-)
-from teuthology.config import config, FakeNamespace
-from teuthology.misc import (
- canonicalize_hostname, config_file, decanonicalize_hostname, merge_configs,
- get_user, sh
-)
-from teuthology.openstack import OpenStack, OpenStackInstance, enforce_json_dictionary
-from teuthology.orchestra import remote
-from teuthology.parallel import parallel
-from teuthology.task import internal
-from teuthology.task.internal import check_lock
log = logging.getLogger(__name__)
-def openstack_volume_id(volume):
- return (volume.get('ID') or volume['id'])
-
-
-def openstack_volume_name(volume):
- return (volume.get('Display Name') or
- volume.get('display_name') or
- volume.get('Name') or
- volume.get('name') or "")
-
-
-def stale_openstack(ctx):
- targets = dict(map(lambda i: (i['ID'], i),
- OpenStack.list_instances()))
- nodes = list_locks(keyed_by_name=True, locked=True)
- stale_openstack_instances(ctx, targets, nodes)
- stale_openstack_nodes(ctx, targets, nodes)
- stale_openstack_volumes(ctx, OpenStack.list_volumes())
- if not ctx.dry_run:
- openstack_remove_again()
-
-#
-# A delay, in seconds, that is significantly longer than
-# any kind of OpenStack server creation / deletion / etc.
-#
-OPENSTACK_DELAY = 30 * 60
-
-
-def stale_openstack_instances(ctx, instances, locked_nodes):
- for (instance_id, instance) in instances.items():
- i = OpenStackInstance(instance_id)
- if not i.exists():
- log.debug("stale-openstack: {instance} disappeared, ignored"
- .format(instance=instance_id))
- continue
- if (i.get_created() >
- config['max_job_time'] + OPENSTACK_DELAY):
- log.info(
- "stale-openstack: destroying instance {instance}"
- " because it was created {created} seconds ago"
- " which is older than"
- " max_job_time {max_job_time} + {delay}"
- .format(instance=i['name'],
- created=i.get_created(),
- max_job_time=config['max_job_time'],
- delay=OPENSTACK_DELAY))
- if not ctx.dry_run:
- i.destroy()
- continue
- name = canonicalize_hostname(i['name'], user=None)
- if i.get_created() > OPENSTACK_DELAY and name not in locked_nodes:
- log.info("stale-openstack: destroying instance {instance}"
- " because it was created {created} seconds ago"
- " is older than {delay}s and it is not locked"
- .format(instance=i['name'],
- created=i.get_created(),
- delay=OPENSTACK_DELAY))
- if not ctx.dry_run:
- i.destroy()
- continue
- log.debug("stale-openstack: instance " + i['name'] + " OK")
-
-
-def openstack_delete_volume(id):
- OpenStack().run("volume delete " + id + " || true")
-
-
-def stale_openstack_volumes(ctx, volumes):
- now = datetime.datetime.now()
- for volume in volumes:
- volume_id = openstack_volume_id(volume)
- try:
- volume = json.loads(OpenStack().run("volume show -f json " +
- volume_id))
- except subprocess.CalledProcessError:
- log.debug("stale-openstack: {id} disappeared, ignored"
- .format(id=volume_id))
- continue
- volume_name = openstack_volume_name(volume)
- enforce_json_dictionary(volume)
- created_at = datetime.datetime.strptime(
- volume['created_at'], '%Y-%m-%dT%H:%M:%S.%f')
- created = (now - created_at).total_seconds()
- if created > config['max_job_time'] + OPENSTACK_DELAY:
- log.info(
- "stale-openstack: destroying volume {volume}({id})"
- " because it was created {created} seconds ago"
- " which is older than"
- " max_job_time {max_job_time} + {delay}"
- .format(volume=volume_name,
- id=volume_id,
- created=created,
- max_job_time=config['max_job_time'],
- delay=OPENSTACK_DELAY))
- if not ctx.dry_run:
- openstack_delete_volume(volume_id)
- continue
- log.debug("stale-openstack: volume " + volume_id + " OK")
-
-
-def stale_openstack_nodes(ctx, instances, locked_nodes):
- names = set([ i['Name'] for i in instances.values() ])
- for (name, node) in locked_nodes.items():
- name = decanonicalize_hostname(name)
- if node['machine_type'] != 'openstack':
- continue
- if (name not in names and
- util.locked_since_seconds(node) > OPENSTACK_DELAY):
- log.info("stale-openstack: unlocking node {name} unlocked"
- " because it was created {created}"
- " seconds ago which is older than {delay}"
- " and it has no instance"
- .format(name=name,
- created=util.locked_since_seconds(node),
- delay=OPENSTACK_DELAY))
- if not ctx.dry_run:
- lock_ops.unlock_one(ctx, name, node['locked_by'])
- continue
- log.debug("stale-openstack: node " + name + " OK")
-
-
-def openstack_remove_again():
+# This is being kept because ceph.git/qa/tasks/cephfs/filesystem.py references it.
+def clear_firewall(ctx):
"""
- Volumes and servers with REMOVE-ME in the name are leftovers
- from removals that failed. It is not uncommon for a failed removal
- to succeed when retried later on.
+ Remove any iptables rules created by teuthology. These rules are
+ identified by containing a comment with 'teuthology' in it. Non-teuthology
+ firewall rules are unaffected.
"""
- sh("""
- openstack server list --name REMOVE-ME --column ID --format value |
- xargs --no-run-if-empty --max-args 1 -P20 openstack server delete --wait
- true
- """)
- volumes = json.loads(OpenStack().run("volume list -f json --long"))
- remove_me = [openstack_volume_id(v) for v in volumes
- if 'REMOVE-ME' in openstack_volume_name(v)]
- for i in remove_me:
- log.info("Trying to remove stale volume %s" % i)
- openstack_delete_volume(i)
-
-
-def main(args):
- ctx = FakeNamespace(args)
- if ctx.verbose:
- teuthology.log.setLevel(logging.DEBUG)
-
- info = {}
- if ctx.archive:
- ctx.config = config_file(ctx.archive + '/config.yaml')
- ifn = os.path.join(ctx.archive, 'info.yaml')
- if os.path.exists(ifn):
- with open(ifn, 'r') as fd:
- info = yaml.safe_load(fd.read())
- if not ctx.pid:
- ctx.pid = info.get('pid')
- if not ctx.pid:
- ctx.pid = int(open(ctx.archive + '/pid').read().rstrip('\n'))
- if not ctx.owner:
- ctx.owner = info.get('owner')
- if not ctx.owner:
- ctx.owner = open(ctx.archive + '/owner').read().rstrip('\n')
-
- if ctx.targets:
- ctx.config = merge_configs(ctx.targets)
-
- if ctx.stale:
- stale_nodes = find_stale_locks(ctx.owner)
- targets = dict()
- for node in stale_nodes:
- targets[node['name']] = node['ssh_pub_key']
- ctx.config = dict(targets=targets)
-
- if ctx.stale_openstack:
- stale_openstack(ctx)
- return
-
- log.info(
- '\n '.join(
- ['targets:', ] + yaml.safe_dump(
- ctx.config['targets'],
- default_flow_style=False).splitlines()))
-
- if ctx.dry_run:
- log.info("Not actually nuking anything since --dry-run was passed")
- return
-
- if ctx.owner is None:
- ctx.owner = get_user()
-
- if ctx.pid:
- if ctx.archive:
- log.info('Killing teuthology process at pid %d', ctx.pid)
- os.system('grep -q %s /proc/%d/cmdline && sudo kill -9 %d' % (
- ctx.archive,
- ctx.pid,
- ctx.pid))
- else:
- subprocess.check_call(["kill", "-9", str(ctx.pid)])
-
- nuke(ctx, ctx.unlock, ctx.synch_clocks, ctx.noipmi, ctx.keep_logs, not ctx.no_reboot)
-
-
-def nuke(ctx, should_unlock, sync_clocks=True, noipmi=False, keep_logs=False, should_reboot=True):
- if 'targets' not in ctx.config:
- return
- total_unnuked = {}
- log.info('Checking targets against current locks')
- with parallel() as p:
- for target, hostkey in ctx.config['targets'].items():
- status = get_status(target)
- if ctx.name and ctx.name not in (status.get('description') or ""):
- total_unnuked[target] = hostkey
- log.info(
- f"Not nuking {target} because description doesn't match: "
- f"{ctx.name} != {status.get('description')}"
- )
- continue
- elif status.get('up') is False:
- total_unnuked[target] = hostkey
- log.info(f"Not nuking {target} because it is down")
- continue
- p.spawn(
- nuke_one,
- ctx,
- {target: hostkey},
- should_unlock,
- sync_clocks,
- ctx.config.get('check-locks', True),
- noipmi,
- keep_logs,
- should_reboot,
- )
- for unnuked in p:
- if unnuked:
- total_unnuked.update(unnuked)
- if total_unnuked:
- log.error('Could not nuke the following targets:\n' +
- '\n '.join(['targets:', ] +
- yaml.safe_dump(
- total_unnuked,
- default_flow_style=False).splitlines()))
-
-
-def nuke_one(ctx, target, should_unlock, synch_clocks,
- check_locks, noipmi, keep_logs, should_reboot):
- ret = None
- ctx = argparse.Namespace(
- config=dict(targets=target),
- owner=ctx.owner,
- check_locks=check_locks,
- synch_clocks=synch_clocks,
- teuthology_config=config.to_dict(),
- name=ctx.name,
- noipmi=noipmi,
+ log.info("Clearing teuthology firewall rules...")
+ ctx.cluster.run(
+ args=[
+ "sudo", "sh", "-c",
+ "iptables-save | grep -v teuthology | iptables-restore"
+ ],
)
- try:
- nuke_helper(ctx, should_unlock, keep_logs, should_reboot)
- except Exception:
- log.exception('Could not nuke %s' % target)
- # not re-raising so that parallel calls aren't killed
- ret = target
- else:
- if should_unlock:
- lock_ops.unlock_one(ctx, list(target.keys())[0], ctx.owner)
- return ret
-
-
-def nuke_helper(ctx, should_unlock, keep_logs, should_reboot):
- # ensure node is up with ipmi
- (target,) = ctx.config['targets'].keys()
- host = target.split('@')[-1]
- shortname = host.split('.')[0]
- if should_unlock:
- if is_vm(shortname):
- return
- log.debug('shortname: %s' % shortname)
- remote_ = remote.Remote(host)
- if ctx.check_locks:
- # does not check whether the node is 'up';
- # we want to be able to nuke a downed node
- check_lock.check_lock(ctx, None, check_up=False)
- status = get_status(host)
- if status['machine_type'] in provision.fog.get_types():
- remote_.console.power_off()
- return
- elif status['machine_type'] in provision.pelagos.get_types():
- provision.pelagos.park_node(host)
- return
- elif remote_.is_container:
- remote_.run(
- args=['sudo', '/testnode_stop.sh'],
- check_status=False,
- )
- return
- if (not ctx.noipmi and 'ipmi_user' in config and
- 'vpm' not in shortname):
- try:
- check_console(host)
- except Exception:
- log.exception('')
- log.info("Will attempt to connect via SSH")
- remote_ = remote.Remote(host)
- remote_.connect()
- internal.add_remotes(ctx, None)
- internal.connect(ctx, None)
- clear_firewall(ctx)
- shutdown_daemons(ctx)
- kill_valgrind(ctx)
- # Try to remove packages before reboot
- remove_installed_packages(ctx)
- remotes = ctx.cluster.remotes.keys()
- if should_reboot:
- reboot(ctx, remotes)
- # shut down daemons again in case the reboot started them
- shutdown_daemons(ctx)
- remove_osd_mounts(ctx)
- remove_osd_tmpfs(ctx)
- kill_hadoop(ctx)
- remove_ceph_packages(ctx)
- synch_clocks(remotes)
- unlock_firmware_repo(ctx)
- remove_configuration_files(ctx)
- undo_multipath(ctx)
- reset_syslog_dir(ctx)
- remove_ceph_data(ctx)
- if not keep_logs:
- remove_testing_tree(ctx)
- remove_yum_timedhosts(ctx)
- # Once again remove packages after reboot
- remove_installed_packages(ctx)
- log.info('Installed packages removed.')
+ log.info("Cleared teuthology firewall rules.")
+++ /dev/null
-import logging
-import time
-
-from teuthology.misc import get_testdir, reconnect
-from teuthology.orchestra import remote as remote_mod, run
-from teuthology.task import install as install_task
-
-
-log = logging.getLogger(__name__)
-
-
-def clear_firewall(ctx):
- """
- Remove any iptables rules created by teuthology. These rules are
- identified by containing a comment with 'teuthology' in it. Non-teuthology
- firewall rules are unaffected.
- """
- log.info("Clearing teuthology firewall rules...")
- ctx.cluster.run(
- args=[
- "sudo", "sh", "-c",
- "iptables-save | grep -v teuthology | iptables-restore"
- ],
- )
- log.info("Cleared teuthology firewall rules.")
-
-
-def shutdown_daemons(ctx):
- log.info('Unmounting ceph-fuse and killing daemons...')
- ctx.cluster.run(args=['sudo', 'stop', 'ceph-all', run.Raw('||'),
- 'sudo', 'service', 'ceph', 'stop', run.Raw('||'),
- 'sudo', 'systemctl', 'stop', 'ceph.target'],
- check_status=False, timeout=180)
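- # Unmount every ceph-fuse and rbd-fuse mountpoint listed in
- # /etc/mtab, then kill any remaining ceph-related processes.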
- ctx.cluster.run(
- args=[
- 'if', 'grep', '-q', 'ceph-fuse', '/etc/mtab', run.Raw(';'),
- 'then',
- 'grep', 'ceph-fuse', '/etc/mtab', run.Raw('|'),
- 'grep', '-o', " /.* fuse", run.Raw('|'),
- 'grep', '-o', "/.* ", run.Raw('|'),
- 'xargs', '-n', '1', 'sudo', 'fusermount', '-u', run.Raw(';'),
- 'fi',
- run.Raw(';'),
- 'if', 'grep', '-q', 'rbd-fuse', '/etc/mtab', run.Raw(';'),
- 'then',
- 'grep', 'rbd-fuse', '/etc/mtab', run.Raw('|'),
- 'grep', '-o', " /.* fuse", run.Raw('|'),
- 'grep', '-o', "/.* ", run.Raw('|'),
- 'xargs', '-n', '1', 'sudo', 'fusermount', '-u', run.Raw(';'),
- 'fi',
- run.Raw(';'),
- 'sudo',
- 'killall',
- '--quiet',
- 'ceph-mon',
- 'ceph-osd',
- 'ceph-mds',
- 'ceph-mgr',
- 'ceph-fuse',
- 'ceph-disk',
- 'radosgw',
- 'ceph_test_rados',
- 'rados',
- 'rbd-fuse',
- 'apache2',
- run.Raw('||'),
- 'true', # ignore errors from ceph binaries not being found
- ],
- timeout=120,
- )
- log.info('All daemons killed.')
-
-
-def kill_hadoop(ctx):
- log.info("Terminating Hadoop services...")
- ctx.cluster.run(args=[
- "pkill", "-f", "-KILL", "java.*hadoop",
- ],
- check_status=False,
- timeout=60
- )
-
-
-def kill_valgrind(ctx):
- # http://tracker.ceph.com/issues/17084
- ctx.cluster.run(
- args=['sudo', 'pkill', '-f', '-9', 'valgrind.bin'],
- check_status=False,
- timeout=20,
- )
-
-
-def remove_osd_mounts(ctx):
- """
- unmount any osd data mounts (scratch disks)
- """
- log.info('Unmount any osd data directories...')
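- # Lazily unmount every /etc/mtab entry under /var/lib/ceph/osd/;
- # the trailing 'true' keeps the command from failing when nothing
- # is mounted.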
- ctx.cluster.run(
- args=[
- 'grep',
- '/var/lib/ceph/osd/',
- '/etc/mtab',
- run.Raw('|'),
- 'awk', '{print $2}', run.Raw('|'),
- 'xargs', '-r',
- 'sudo', 'umount', '-l', run.Raw(';'),
- 'true'
- ],
- timeout=120
- )
-
-
-def remove_osd_tmpfs(ctx):
- """
- unmount tmpfs mounts
- """
- log.info('Unmount any osd tmpfs dirs...')
- ctx.cluster.run(
- args=[
- 'egrep', r'tmpfs\s+/mnt', '/etc/mtab', run.Raw('|'),
- 'awk', '{print $2}', run.Raw('|'),
- 'xargs', '-r',
- 'sudo', 'umount', run.Raw(';'),
- 'true'
- ],
- timeout=120
- )
-
-
-def stale_kernel_mount(remote):
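- # A kernel mount is considered stale if /sys/kernel/debug/ceph
- # contains any entry besides meta/; 'read' exits 0 only when find
- # printed at least one such path.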
- proc = remote.run(
- args=[
- 'sudo', 'find',
- '/sys/kernel/debug/ceph',
- '-mindepth', '1',
- run.Raw('!'),
- '-path', '/sys/kernel/debug/ceph/meta',
- run.Raw('!'),
- '-path', '/sys/kernel/debug/ceph/meta/client_features',
- '-type', 'd',
- run.Raw('|'),
- 'read'
- ],
- check_status=False
- )
- return proc.exitstatus == 0
-
-
-def reboot(ctx, remotes):
- for remote in remotes:
- if stale_kernel_mount(remote):
- log.warning('Stale kernel mount on %s!', remote.name)
- log.info('force/no-sync rebooting %s', remote.name)
- # -n is ignored in systemd versions through v229, which means this
- # only works on trusty -- on 7.3 (v219) and xenial (v229) reboot -n
- # still calls sync().
- # args = ['sync', run.Raw('&'),
- # 'sleep', '5', run.Raw(';'),
- # 'sudo', 'reboot', '-f', '-n']
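- # Magic SysRq: 's' forces an emergency sync, 'u' remounts all
- # filesystems read-only, 'b' reboots immediately without syncing
- # or unmounting.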
- args = ['for', 'sysrq', 'in', 's', 'u', 'b', run.Raw(';'),
- 'do', 'echo', run.Raw('$sysrq'), run.Raw('|'),
- 'sudo', 'tee', '/proc/sysrq-trigger', run.Raw(';'),
- 'done']
- else:
- log.info('rebooting %s', remote.name)
- args = ['sudo', 'reboot']
- try:
- remote.run(args=args, wait=False)
- except Exception:
- log.exception('ignoring exception during reboot command')
- # we just ignore these procs because reboot -f doesn't actually
- # send anything back to the ssh client!
- if remotes:
- log.info('waiting for nodes to reboot')
- time.sleep(8) # if we try and reconnect too quickly, it succeeds!
- reconnect(ctx, 480) # allow 8 minutes for the reboots
-
-
-def reset_syslog_dir(ctx):
- log.info('Resetting syslog output locations...')
- nodes = {}
- for remote in ctx.cluster.remotes.keys():
- proc = remote.run(
- args=[
- 'if', 'test', '-e', '/etc/rsyslog.d/80-cephtest.conf',
- run.Raw(';'),
- 'then',
- 'sudo', 'rm', '-f', '--', '/etc/rsyslog.d/80-cephtest.conf',
- run.Raw('&&'),
- 'sudo', 'service', 'rsyslog', 'restart',
- run.Raw(';'),
- 'fi',
- run.Raw(';'),
- ],
- timeout=60,
- )
- nodes[remote.name] = proc
-
- for name, proc in nodes.items():
- log.info('Waiting for %s to restart syslog...', name)
- proc.wait()
-
-
-def dpkg_configure(ctx):
- for remote in ctx.cluster.remotes.keys():
- if remote.os.package_type != 'deb':
- continue
- log.info(
- 'Waiting for dpkg --configure -a and apt-get -f install...')
- remote.run(
- args=[
- 'sudo', 'dpkg', '--configure', '-a',
- run.Raw(';'),
- 'sudo', 'DEBIAN_FRONTEND=noninteractive',
- 'apt-get', '-y', '--force-yes', '-f', 'install',
- run.Raw('||'),
- ':',
- ],
- timeout=180,
- check_status=False,
- )
-
-
-def remove_yum_timedhosts(ctx):
- # Workaround for https://bugzilla.redhat.com/show_bug.cgi?id=1233329
- log.info("Removing yum timedhosts files...")
- for remote in ctx.cluster.remotes.keys():
- if remote.os.package_type != 'rpm':
- continue
- remote.run(
- args=r"sudo find /var/cache/yum -name 'timedhosts' -exec rm {} \;",
- check_status=False, timeout=180
- )
-
-
-def remove_ceph_packages(ctx):
- """
- Forcibly remove ceph and ceph-dependent packages.
- Force is needed since the node's repos might have changed and,
- in many cases, automatic dependency resolution will not work
- because packages are missing after the repo changes.
- """
- log.info("Force remove ceph packages")
- ceph_packages_to_remove = ['ceph-common', 'ceph-mon', 'ceph-osd',
- 'libcephfs1', 'libcephfs2',
- 'librados2', 'librgw2', 'librbd1', 'python-rgw',
- 'ceph-selinux', 'python-cephfs', 'ceph-base',
- 'python-rbd', 'python-rados', 'ceph-mds',
- 'ceph-mgr', 'libcephfs-java', 'libcephfs-jni',
- 'ceph-deploy', 'libapache2-mod-fastcgi'
- ]
- pkgs = str.join(' ', ceph_packages_to_remove)
- for remote in ctx.cluster.remotes.keys():
- if remote.os.package_type == 'rpm':
- log.info("Remove any broken repos")
- dist_release = remote.os.name
- remote.run(
- args=['sudo', 'rm', run.Raw("/etc/yum.repos.d/*ceph*")],
- check_status=False
- )
- remote.run(
- args=['sudo', 'rm', run.Raw("/etc/yum.repos.d/*fcgi*")],
- check_status=False,
- )
- remote.run(
- args=['sudo', 'rm', run.Raw("/etc/yum.repos.d/*samba*")],
- check_status=False,
- )
- remote.run(
- args=['sudo', 'rm', run.Raw("/etc/yum.repos.d/*nfs-ganesha*")],
- check_status=False,
- )
- remote.run(
- args=['sudo', 'rpm', '--rebuilddb']
- )
- if dist_release in ['opensuse', 'sle']:
- remote.sh('sudo zypper clean')
- log.info('Remove any ceph packages')
- remote.sh('sudo zypper remove --non-interactive ' + pkgs,
- check_status=False
- )
- else:
- remote.sh('sudo yum clean all')
- log.info('Remove any ceph packages')
- remote.sh('sudo yum remove -y ' + pkgs, check_status=False)
- else:
- log.info("Remove any broken repos")
- remote.run(
- args=['sudo', 'rm', run.Raw("/etc/apt/sources.list.d/*ceph*")],
- check_status=False,
- )
- remote.run(
- args=['sudo', 'rm', run.Raw("/etc/apt/sources.list.d/*samba*")],
- check_status=False,
- )
- remote.run(
- args=['sudo', 'rm', run.Raw("/etc/apt/sources.list.d/*nfs-ganesha*")],
- check_status=False,
- )
- log.info("Autoclean")
- remote.run(
- args=['sudo', 'apt-get', 'autoclean'],
- check_status=False,
- )
- log.info('Remove any ceph packages')
- remote.run(
- args=[
- 'sudo', 'dpkg', '--remove', '--force-remove-reinstreq',
- run.Raw(pkgs)
- ],
- check_status=False
- )
- log.info("Autoclean")
- remote.run(
- args=['sudo', 'apt-get', 'autoclean']
- )
-
-
-def remove_installed_packages(ctx):
- dpkg_configure(ctx)
- conf = dict(
- project='ceph',
- debuginfo='true',
- )
- packages = install_task.get_package_list(ctx, conf)
- debs = packages['deb'] + \
- ['salt-common', 'salt-minion', 'calamari-server',
- 'python-rados', 'multipath-tools']
- rpms = packages['rpm'] + \
- ['salt-common', 'salt-minion', 'calamari-server',
- 'multipath-tools', 'device-mapper-multipath']
- install_task.remove_packages(
- ctx,
- conf,
- dict(
- deb=debs,
- rpm=rpms,
- )
- )
- install_task.remove_sources(ctx, conf)
-
-
-def remove_ceph_data(ctx):
- log.info("Removing any stale ceph data...")
- ctx.cluster.run(
- args=[
- 'sudo', 'rm', '-rf', '/etc/ceph',
- run.Raw('/var/run/ceph*'),
- ],
- )
-
-
-def remove_testing_tree(ctx):
- log.info('Clearing filesystem of test data...')
- ctx.cluster.run(
- args=[
- 'sudo', 'rm', '-rf', get_testdir(ctx),
- # just for old time's sake
- run.Raw('&&'),
- 'sudo', 'rm', '-rf', '/tmp/cephtest',
- run.Raw('&&'),
- 'sudo', 'rm', '-rf', '/home/ubuntu/cephtest',
- ],
- )
-
-
-def remove_configuration_files(ctx):
- """
- Goes through a list of commonly used configuration files used for testing
- that should not be left behind.
-
- For example, sometimes ceph-deploy may be configured via
- ``~/.cephdeploy.conf`` to alter how it handles installation by specifying
- a default section in its config with custom locations.
- """
- ctx.cluster.run(
- args=[
- 'rm', '-f', '/home/ubuntu/.cephdeploy.conf'
- ],
- timeout=30
- )
-
-
-def undo_multipath(ctx):
- """
- Undo any multipath device mappings created, and
- remove the packages/daemon that manage them so they don't
- come back unless specifically requested by the test.
- """
- log.info('Removing any multipath config/pkgs...')
- for remote in ctx.cluster.remotes.keys():
- remote.run(
- args=[
- 'sudo', 'multipath', '-F',
- ],
- check_status=False,
- timeout=60
- )
-
-
-def synch_clocks(remotes):
- log.info('Synchronizing clocks...')
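- # For each remote: stop whichever time daemon is installed, step the
- # clock (ntpdate / ntpd -gq, or just report chrony sources), write it
- # to the hardware clock, then restart the daemon, ignoring failures.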
- for remote in remotes:
- remote.run(
- args=[
- 'sudo', 'systemctl', 'stop', 'ntp.service', run.Raw('||'),
- 'sudo', 'systemctl', 'stop', 'ntpd.service', run.Raw('||'),
- 'sudo', 'systemctl', 'stop', 'chronyd.service',
- run.Raw('&&'),
- 'sudo', 'ntpdate-debian', run.Raw('||'),
- 'sudo', 'ntp', '-gq', run.Raw('||'),
- 'sudo', 'ntpd', '-gq', run.Raw('||'),
- 'sudo', 'chronyc', 'sources',
- run.Raw('&&'),
- 'sudo', 'hwclock', '--systohc', '--utc',
- run.Raw('&&'),
- 'sudo', 'systemctl', 'start', 'ntp.service', run.Raw('||'),
- 'sudo', 'systemctl', 'start', 'ntpd.service', run.Raw('||'),
- 'sudo', 'systemctl', 'start', 'chronyd.service',
- run.Raw('||'),
- 'true', # ignore errors; we may be racing with ntpd startup
- ],
- timeout=60,
- )
-
-
-def unlock_firmware_repo(ctx):
- log.info('Making sure firmware.git is not locked...')
- ctx.cluster.run(args=['sudo', 'rm', '-f',
- '/lib/firmware/updates/.git/index.lock', ])
-
-
-def check_console(hostname):
- remote = remote_mod.Remote(hostname)
- shortname = remote.shortname
- console = remote.console
- if not console:
- return
- cname = '{host}.{domain}'.format(
- host=shortname,
- domain=console.ipmidomain,
- )
- log.info('checking console status of %s' % cname)
- if console.check_status():
- log.info('console ready on %s' % cname)
- return
- if console.check_power('on'):
- log.info('attempting to reboot %s' % cname)
- console.power_cycle()
- else:
- log.info('attempting to power on %s' % cname)
- console.power_on()
- timeout = 100
- log.info('checking console status of %s with timeout %s' %
- (cname, timeout))
- if console.check_status(timeout=timeout):
- log.info('console ready on %s' % cname)
- else:
- log.error("Failed to get console status for %s, " % cname)
+++ /dev/null
-stop_worker: true
-nuke-on-error: true
-roles:
-- - client.0
-tasks:
-- exec:
- client.0:
- - exit 1
+++ /dev/null
-import datetime
-import json
-import os
-import pytest
-import subprocess
-
-from unittest.mock import patch, Mock, DEFAULT, ANY
-
-from teuthology import nuke
-from teuthology import misc
-from teuthology.config import config
-from teuthology.dispatcher.supervisor import create_fake_context
-
-class TestNuke(object):
-
- #@pytest.mark.skipif('OS_AUTH_URL' not in os.environ,
- # reason="no OS_AUTH_URL environment variable")
- def test_stale_openstack_volumes(self):
- ctx = Mock()
- ctx.teuthology_config = config
- ctx.dry_run = False
- now = datetime.datetime.strftime(datetime.datetime.now(),
- "%Y-%m-%dT%H:%M:%S.000000")
- id = '4bee3af9-febb-40c1-a17e-ff63edb415c5'
- name = 'target1-0'
- volume_list = json.loads(
- '[{'
- ' "ID": "' + id + '"'
- '}]'
- )
- #
- # A volume created a second ago is left untouched
- #
- volume_show = (
- '{"id": "' + id + '", '
- '"created_at": "' + now + '", '
- '"display_name": "' + name + '"}'
- )
-
- with patch('teuthology.nuke.openstack_delete_volume') as m_os_del_vol:
- with patch.object(nuke.OpenStack, 'run') as m_os_run:
- m_os_run.return_value = volume_show
- nuke.stale_openstack_volumes(ctx, volume_list)
- m_os_del_vol.assert_not_called()
-
-
- #
- # A volume created long ago is destroyed
- #
- ancient = "2000-11-02T15:43:12.000000"
- volume_show = (
- '{"id": "' + id + '", '
- '"created_at": "' + ancient + '", '
- '"display_name": "' + name + '"}'
- )
-
- with patch('teuthology.nuke.openstack_delete_volume') as m_os_del_vol:
- with patch.object(nuke.OpenStack, 'run') as m_os_run:
- m_os_run.return_value = volume_show
- nuke.stale_openstack_volumes(ctx, volume_list)
- m_os_del_vol.assert_called_with(id)
-
- #
- # A volume that no longer exists is ignored
- #
- with patch('teuthology.nuke.openstack_delete_volume') as m_os_del_vol:
- with patch.object(nuke.OpenStack, 'run') as m_os_run:
- m_os_run.side_effect = subprocess.CalledProcessError('ERROR', 'FAIL')
- nuke.stale_openstack_volumes(ctx, volume_list)
- m_os_del_vol.assert_not_called()
-
- def test_stale_openstack_nodes(self):
- ctx = Mock()
- ctx.teuthology_config = config
- ctx.dry_run = False
- name = 'target1'
- uuid = 'UUID1'
- now = datetime.datetime.strftime(datetime.datetime.now(),
- "%Y-%m-%d %H:%M:%S.%f")
- #
- # A node that is not of type openstack is left untouched
- #
- with patch("teuthology.lock.ops.unlock_one") as m_unlock_one:
- nuke.stale_openstack_nodes(
- ctx,
- {},
- {name: {'locked_since': now, 'machine_type': 'mira'}},
- )
- m_unlock_one.assert_not_called()
- #
- # A node that was just locked and does not have
- # an instance yet is left untouched
- #
- with patch("teuthology.lock.ops.unlock_one") as m_unlock_one:
- nuke.stale_openstack_nodes(
- ctx,
- {},
- {name: {'locked_since': now, 'machine_type': 'openstack'}},
- )
- m_unlock_one.assert_not_called()
- #
- # A node that has been locked for some time and
- # has no instance is unlocked.
- #
- ancient = "2000-11-02 15:43:12.000000"
- me = 'loic@dachary.org'
- with patch("teuthology.lock.ops.unlock_one") as m_unlock_one:
- nuke.stale_openstack_nodes(
- ctx,
- {},
- {name: {'locked_since': ancient, 'locked_by': me, 'machine_type': 'openstack'}},
- )
- m_unlock_one.assert_called_with(ctx, name, me)
- #
- # A node that has been locked for some time and
- # has an instance is left untouched
- #
- with patch("teuthology.lock.ops.unlock_one") as m_unlock_one:
- nuke.stale_openstack_nodes(
- ctx,
- {uuid: {'ID': uuid, 'Name': name}},
- {name: {'locked_since': ancient, 'machine_type': 'openstack'}},
- )
- m_unlock_one.assert_not_called()
-
- def test_stale_openstack_instances(self):
- if 'OS_AUTH_URL' not in os.environ:
- pytest.skip('no OS_AUTH_URL environment variable')
- ctx = Mock()
- ctx.teuthology_config = config
- ctx.dry_run = False
- name = 'target1'
- uuid = 'UUID1'
- #
- # An instance created a second ago is left untouched,
- # even when it is not locked.
- #
- with patch.multiple(
- nuke.OpenStackInstance,
- exists=lambda _: True,
- get_created=lambda _: 1,
- __getitem__=lambda _, key: name,
- destroy=DEFAULT,
- ) as m:
- nuke.stale_openstack_instances(ctx, {
- uuid: { 'Name': name, },
- }, {
- })
- m['destroy'].assert_not_called()
- #
- # An instance created a very long time ago is destroyed
- #
- with patch.multiple(
- nuke.OpenStackInstance,
- exists=lambda _: True,
- get_created=lambda _: 1000000000,
- __getitem__=lambda _, key: name,
- destroy=DEFAULT,
- ) as m:
- nuke.stale_openstack_instances(ctx, {
- uuid: { 'Name': name, },
- }, {
- misc.canonicalize_hostname(name, user=None): {},
- })
- m['destroy'].assert_called_with()
- #
- # An instance that turns out to not exist any longer
- # is ignored.
- #
- with patch.multiple(
- nuke.OpenStackInstance,
- exists=lambda _: False,
- __getitem__=lambda _, key: name,
- destroy=DEFAULT,
- ) as m:
- nuke.stale_openstack_instances(ctx, {
- uuid: { 'Name': name, },
- }, {
- misc.canonicalize_hostname(name, user=None): {},
- })
- m['destroy'].assert_not_called()
- #
- # An instance created but not locked after a while is
- # destroyed.
- #
- with patch.multiple(
- nuke.OpenStackInstance,
- exists=lambda _: True,
- get_created=lambda _: nuke.OPENSTACK_DELAY + 1,
- __getitem__=lambda _, key: name,
- destroy=DEFAULT,
- ) as m:
- nuke.stale_openstack_instances(ctx, {
- uuid: { 'Name': name, },
- }, {
- })
- m['destroy'].assert_called_with()
- #
- # An instance created within the expected lifetime
- # of a job and locked is left untouched.
- #
- with patch.multiple(
- nuke.OpenStackInstance,
- exists=lambda _: True,
- get_created=lambda _: nuke.OPENSTACK_DELAY + 1,
- __getitem__=lambda _, key: name,
- destroy=DEFAULT,
- ) as m:
- nuke.stale_openstack_instances(ctx, {
- uuid: { 'Name': name, },
- }, {
- misc.canonicalize_hostname(name, user=None): {},
- })
- m['destroy'].assert_not_called()
-
-
-@patch("teuthology.lock.ops.unlock_one")
-def test_nuke_internal(m_unlock_one):
- job_config = dict(
- owner='test_owner',
- targets={'user@host1': 'key1', 'user@host2': 'key2'},
- archive_path='/path/to/test_run',
- machine_type='test_machine',
- os_type='centos',
- os_version='8.3',
- name='test_name',
- )
- statuses = {
- target: {'name': target, 'description': job_config['name']}
- for target in job_config['targets'].keys()
- }
- ctx = create_fake_context(job_config)
-
- # minimal call using defaults
- with patch.multiple(
- nuke,
- nuke_helper=DEFAULT,
- get_status=lambda i: statuses[i],
- ) as m:
- nuke.nuke(ctx, True)
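- # nuke_helper positional args: (ctx, should_unlock, keep_logs, should_reboot)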
- m['nuke_helper'].assert_called_with(ANY, True, False, True)
- m_unlock_one.assert_called()
- m_unlock_one.reset_mock()
-
- # don't unlock
- with patch.multiple(
- nuke,
- nuke_helper=DEFAULT,
- get_status=lambda i: statuses[i],
- ) as m:
- nuke.nuke(ctx, False)
- m['nuke_helper'].assert_called_with(ANY, False, False, True)
- m_unlock_one.assert_not_called()
- m_unlock_one.reset_mock()
-
- # mimicking what teuthology-dispatcher --supervisor does
- with patch.multiple(
- nuke,
- nuke_helper=DEFAULT,
- get_status=lambda i: statuses[i],
- ) as m:
- nuke.nuke(ctx, False, True, False, True, False)
- m['nuke_helper'].assert_called_with(ANY, False, True, False)
- m_unlock_one.assert_not_called()
- m_unlock_one.reset_mock()
-
- # no targets
- del ctx.config['targets']
- with patch.multiple(
- nuke,
- nuke_helper=DEFAULT,
- get_status=lambda i: statuses[i],
- ) as m:
- nuke.nuke(ctx, True)
- m['nuke_helper'].assert_not_called()
- m_unlock_one.assert_not_called()