From: Zack Cerza
Date: Tue, 16 Aug 2016 19:52:20 +0000 (-0600)
Subject: Make teuthology.nuke a subpackage
X-Git-Tag: 1.1.0~548^2~7
X-Git-Url: http://git.apps.os.sepia.ceph.com/?a=commitdiff_plain;h=21ceb15e4005064edb52405efdb603ef131638b7;p=teuthology.git

Make teuthology.nuke a subpackage

Signed-off-by: Zack Cerza
---

diff --git a/teuthology/nuke.py b/teuthology/nuke.py
deleted file mode 100644
index 3a6f6f8057..0000000000
--- a/teuthology/nuke.py
+++ /dev/null
@@ -1,748 +0,0 @@
-import argparse
-import datetime
-import json
-import logging
-import os
-import subprocess
-import time
-import yaml
-
-import teuthology
-from . import orchestra
-import orchestra.remote
-from .openstack import OpenStack, OpenStackInstance, enforce_json_dictionary
-from .orchestra import run
-from .config import config, FakeNamespace
-from .lock import list_locks
-from .lock import locked_since_seconds
-from .lock import unlock_one
-from .lock import find_stale_locks
-from .lockstatus import get_status
-from .misc import canonicalize_hostname
-from .misc import config_file
-from .misc import decanonicalize_hostname
-from .misc import merge_configs
-from .misc import get_testdir
-from .misc import get_user
-from .misc import reconnect
-from .misc import sh
-from .parallel import parallel
-from .task import install as install_task
-from .task.internal import check_lock, add_remotes, connect
-
-log = logging.getLogger(__name__)
-
-
-def clear_firewall(ctx):
-    """
-    Remove any iptables rules created by teuthology. These rules are
-    identified by containing a comment with 'teuthology' in it. Non-teuthology
-    firewall rules are unaffected.
-    """
-    ctx.cluster.run(
-        args=[
-            "sudo", "sh", "-c",
-            "iptables-save | grep -v teuthology | iptables-restore"
-        ],
-    )
-
-
-def shutdown_daemons(ctx):
-    ctx.cluster.run(args=['sudo', 'stop', 'ceph-all', run.Raw('||'),
-                          'sudo', 'service', 'ceph', 'stop', run.Raw('||'),
-                          'sudo', 'systemctl', 'stop', 'ceph.target'],
-                    check_status=False, timeout=180)
-    ctx.cluster.run(
-        args=[
-            'if', 'grep', '-q', 'ceph-fuse', '/etc/mtab', run.Raw(';'),
-            'then',
-            'grep', 'ceph-fuse', '/etc/mtab', run.Raw('|'),
-            'grep', '-o', " /.* fuse", run.Raw('|'),
-            'grep', '-o', "/.* ", run.Raw('|'),
-            'xargs', '-n', '1', 'sudo', 'fusermount', '-u', run.Raw(';'),
-            'fi',
-            run.Raw(';'),
-            'if', 'grep', '-q', 'rbd-fuse', '/etc/mtab', run.Raw(';'),
-            'then',
-            'grep', 'rbd-fuse', '/etc/mtab', run.Raw('|'),
-            'grep', '-o', " /.* fuse", run.Raw('|'),
-            'grep', '-o', "/.* ", run.Raw('|'),
-            'xargs', '-n', '1', 'sudo', 'fusermount', '-u', run.Raw(';'),
-            'fi',
-            run.Raw(';'),
-            'sudo',
-            'killall',
-            '--quiet',
-            'ceph-mon',
-            'ceph-osd',
-            'ceph-mds',
-            'ceph-fuse',
-            'ceph-disk',
-            'radosgw',
-            'ceph_test_rados',
-            'rados',
-            'rbd-fuse',
-            'apache2',
-            run.Raw('||'),
-            'true',  # ignore errors from ceph binaries not being found
-        ],
-        timeout=120,
-    )
-
-
-def kill_hadoop(ctx):
-    ctx.cluster.run(args=[
-        "ps", "-ef",
-        run.Raw("|"), "grep", "java.*hadoop",
-        run.Raw("|"), "grep", "-v", "grep",
-        run.Raw("|"), 'awk', '{print $2}',
-        run.Raw("|"), 'xargs', 'kill', '-9',
-    ], check_status=False, timeout=60)
-
-
-def remove_kernel_mounts(ctx):
-    """
-    properly we should be able to just do a forced unmount,
-    but that doesn't seem to be working, so you should reboot instead
-    """
-    log.info('clearing kernel mount from all nodes')
-    ctx.cluster.run(
-        args=[
-            'grep', 'ceph', '/etc/mtab', run.Raw('|'),
-            'grep', '-o', "on /.* type", run.Raw('|'),
-            'grep', '-o', "/.* ", run.Raw('|'),
-            'xargs', '-r',
-            'sudo',
'umount', '-f', run.Raw(';'), - ], - check_status=False, - timeout=60 - ) - - -def remove_osd_mounts(ctx): - """ - unmount any osd data mounts (scratch disks) - """ - ctx.cluster.run( - args=[ - 'grep', - '/var/lib/ceph/osd/', - '/etc/mtab', - run.Raw('|'), - 'awk', '{print $2}', run.Raw('|'), - 'xargs', '-r', - 'sudo', 'umount', run.Raw(';'), - 'true' - ], - timeout=120 - ) - - -def remove_osd_tmpfs(ctx): - """ - unmount tmpfs mounts - """ - ctx.cluster.run( - args=[ - 'egrep', 'tmpfs\s+/mnt', '/etc/mtab', run.Raw('|'), - 'awk', '{print $2}', run.Raw('|'), - 'xargs', '-r', - 'sudo', 'umount', run.Raw(';'), - 'true' - ], - timeout=120 - ) - - -def reboot(ctx, remotes): - nodes = {} - for remote in remotes: - log.info('rebooting %s', remote.name) - try: - proc = remote.run( - args=[ - 'sync', - run.Raw('&'), - 'sleep', '5', - run.Raw(';'), - 'sudo', 'reboot', - ], - wait=False, - ) - except Exception: - log.exception('ignoring exception during reboot command') - nodes[remote] = proc - # we just ignore these procs because reboot -f doesn't actually - # send anything back to the ssh client! - # for remote, proc in nodes.iteritems(): - # proc.wait() - if remotes: - log.info('waiting for nodes to reboot') - time.sleep(8) # if we try and reconnect too quickly, it succeeds! - reconnect(ctx, 480) # allow 8 minutes for the reboots - - -def reset_syslog_dir(ctx): - nodes = {} - for remote in ctx.cluster.remotes.iterkeys(): - proc = remote.run( - args=[ - 'if', 'test', '-e', '/etc/rsyslog.d/80-cephtest.conf', - run.Raw(';'), - 'then', - 'sudo', 'rm', '-f', '--', '/etc/rsyslog.d/80-cephtest.conf', - run.Raw('&&'), - 'sudo', 'service', 'rsyslog', 'restart', - run.Raw(';'), - 'fi', - run.Raw(';'), - ], - timeout=60, - ) - nodes[remote.name] = proc - - for name, proc in nodes.iteritems(): - log.info('Waiting for %s to restart syslog...', name) - proc.wait() - - -def dpkg_configure(ctx): - for remote in ctx.cluster.remotes.iterkeys(): - if remote.os.package_type != 'deb': - continue - log.info( - 'Waiting for dpkg --configure -a and apt-get -f install...') - remote.run( - args=[ - 'sudo', 'dpkg', '--configure', '-a', - run.Raw(';'), - 'sudo', 'DEBIAN_FRONTEND=noninteractive', - 'apt-get', '-y', '--force-yes', '-f', 'install', - run.Raw('||'), - ':', - ], - timeout=180, - check_status=False, - ) - - -def remove_yum_timedhosts(ctx): - # Workaround for https://bugzilla.redhat.com/show_bug.cgi?id=1233329 - log.info("Removing yum timedhosts files...") - for remote in ctx.cluster.remotes.iterkeys(): - if remote.os.package_type != 'rpm': - continue - remote.run( - args="sudo find /var/cache/yum -name 'timedhosts' -exec rm {} \;", - check_status=False, timeout=180 - ) - - -def remove_ceph_packages(ctx): - """ - remove ceph and ceph dependent packages by force - force is needed since the node's repo might have changed and - in many cases autocorrect will not work due to missing packages - due to repo changes - """ - ceph_packages_to_remove = ['ceph-common', 'ceph-mon', 'ceph-osd', - 'libcephfs1', 'librados2', 'librgw2', 'librbd1', - 'ceph-selinux', 'python-cephfs', 'ceph-base', - 'python-rbd', 'python-rados', 'ceph-mds', - 'libcephfs-java', 'libcephfs-jni', - 'ceph-deploy', 'libapache2-mod-fastcgi' - ] - pkgs = str.join(' ', ceph_packages_to_remove) - for remote in ctx.cluster.remotes.iterkeys(): - if remote.os.package_type == 'rpm': - log.info("Remove any broken repos") - remote.run( - args=['sudo', 'rm', run.Raw("/etc/yum.repos.d/*ceph*")], - check_status=False - ) - remote.run( - args=['sudo', 'rm', 
run.Raw("/etc/yum.repos.d/*fcgi*")], - check_status=False, - ) - remote.run( - args=['sudo', 'rpm', '--rebuilddb', run.Raw('&&'), 'yum', - 'clean', 'all'] - ) - log.info('Remove any ceph packages') - remote.run( - args=['sudo', 'yum', 'remove', '-y', run.Raw(pkgs)], - check_status=False - ) - else: - log.info("Remove any broken repos") - remote.run( - args=['sudo', 'rm', run.Raw("/etc/apt/sources.list.d/*ceph*")], - check_status=False, - ) - log.info("Autoclean") - remote.run( - args=['sudo', 'apt-get', 'autoclean'], - check_status=False, - ) - log.info('Remove any ceph packages') - remote.run( - args=[ - 'sudo', 'dpkg', '--remove', '--force-remove-reinstreq', - run.Raw(pkgs) - ], - check_status=False - ) - log.info("Autoclean") - remote.run( - args=['sudo', 'apt-get', 'autoclean'] - ) - - -def remove_installed_packages(ctx): - dpkg_configure(ctx) - conf = dict( - project='ceph', - debuginfo='true', - ) - packages = install_task.get_package_list(ctx, conf) - debs = packages['deb'] + \ - ['salt-common', 'salt-minion', 'calamari-server', - 'python-rados', 'multipath-tools'] - rpms = packages['rpm'] + \ - ['salt-common', 'salt-minion', 'calamari-server', - 'multipath-tools', 'device-mapper-multipath'] - install_task.remove_packages( - ctx, - conf, - dict( - deb=debs, - rpm=rpms, - ) - ) - install_task.remove_sources(ctx, conf) - install_task.purge_data(ctx) - - -def remove_testing_tree(ctx): - ctx.cluster.run( - args=[ - 'sudo', 'rm', '-rf', get_testdir(ctx), - # just for old time's sake - run.Raw('&&'), - 'sudo', 'rm', '-rf', '/tmp/cephtest', - run.Raw('&&'), - 'sudo', 'rm', '-rf', '/home/ubuntu/cephtest', - run.Raw('&&'), - 'sudo', 'rm', '-rf', '/etc/ceph', - ], - ) - - -def remove_configuration_files(ctx): - """ - Goes through a list of commonly used configuration files used for testing - that should not be left behind. - - For example, sometimes ceph-deploy may be configured via - ``~/.cephdeploy.conf`` to alter how it handles installation by specifying - a default section in its config with custom locations. - """ - ctx.cluster.run( - args=[ - 'rm', '-f', '/home/ubuntu/.cephdeploy.conf' - ], - timeout=30 - ) - - -def undo_multipath(ctx): - """ - Undo any multipath device mappings created, an - remove the packages/daemon that manages them so they don't - come back unless specifically requested by the test. 
- """ - for remote in ctx.cluster.remotes.iterkeys(): - remote.run( - args=[ - 'sudo', 'multipath', '-F', - ], - check_status=False, - timeout=60 - ) - - -def synch_clocks(remotes): - for remote in remotes: - remote.run( - args=[ - 'sudo', 'service', 'ntp', 'stop', - run.Raw('&&'), - 'sudo', 'ntpdate-debian', - run.Raw('&&'), - 'sudo', 'hwclock', '--systohc', '--utc', - run.Raw('&&'), - 'sudo', 'service', 'ntp', 'start', - run.Raw('||'), - 'true', # ignore errors; we may be racing with ntpd startup - ], - timeout=60, - ) - - -def check_console(hostname): - shortname = orchestra.remote.getShortName(hostname) - console = orchestra.remote.getRemoteConsole( - name=hostname, - ipmiuser=config['ipmi_user'], - ipmipass=config['ipmi_password'], - ipmidomain=config['ipmi_domain']) - cname = '{host}.{domain}'.format( - host=shortname, - domain=config['ipmi_domain']) - log.info('checking console status of %s' % cname) - if console.check_status(): - log.info('console ready on %s' % cname) - return - if console.check_power('on'): - log.info('attempting to reboot %s' % cname) - console.power_cycle() - else: - log.info('attempting to power on %s' % cname) - console.power_on() - timeout = 100 - log.info('checking console status of %s with timeout %s' % - (cname, timeout)) - if console.check_status(timeout=timeout): - log.info('console ready on %s' % cname) - else: - log.error( - "Failed to get console status for %s, " % cname - ) - - -def stale_openstack(ctx): - targets = dict(map(lambda i: (i['ID'], i), - OpenStack.list_instances())) - nodes = list_locks(keyed_by_name=True, locked=True) - stale_openstack_instances(ctx, targets, nodes) - stale_openstack_nodes(ctx, targets, nodes) - stale_openstack_volumes(ctx, OpenStack.list_volumes()) - if not ctx.dry_run: - openstack_remove_again() - -# -# A delay, in seconds, that is significantly longer than -# any kind of OpenStack server creation / deletion / etc. 
-# -OPENSTACK_DELAY = 30 * 60 - - -def stale_openstack_instances(ctx, instances, locked_nodes): - for (instance_id, instance) in instances.iteritems(): - i = OpenStackInstance(instance_id) - if not i.exists(): - log.debug("stale-openstack: {instance} disappeared, ignored" - .format(instance=instance_id)) - continue - if (i.get_created() > - config['max_job_time'] + OPENSTACK_DELAY): - log.info( - "stale-openstack: destroying instance {instance}" - " because it was created {created} seconds ago" - " which is older than" - " max_job_time {max_job_time} + {delay}" - .format(instance=i['name'], - created=i.get_created(), - max_job_time=config['max_job_time'], - delay=OPENSTACK_DELAY)) - if not ctx.dry_run: - i.destroy() - continue - name = canonicalize_hostname(i['name'], user=None) - if i.get_created() > OPENSTACK_DELAY and name not in locked_nodes: - log.info("stale-openstack: destroying instance {instance}" - " because it was created {created} seconds ago" - " is older than {delay}s and it is not locked" - .format(instance=i['name'], - created=i.get_created(), - delay=OPENSTACK_DELAY)) - if not ctx.dry_run: - i.destroy() - continue - log.debug("stale-openstack: instance " + i['name'] + " OK") - - -def openstack_delete_volume(id): - sh("openstack volume delete " + id + " || true") - - -def stale_openstack_volumes(ctx, volumes): - now = datetime.datetime.now() - for volume in volumes: - volume_id = volume.get('ID') or volume['id'] - try: - volume = json.loads(sh("openstack -q volume show -f json " + - volume_id)) - except subprocess.CalledProcessError: - log.debug("stale-openstack: {id} disappeared, ignored" - .format(id=volume_id)) - continue - volume_name = (volume.get('Display Name') or volume.get('display_name') - or volume['name']) - enforce_json_dictionary(volume) - created_at = datetime.datetime.strptime( - volume['created_at'], '%Y-%m-%dT%H:%M:%S.%f') - created = (now - created_at).total_seconds() - if created > config['max_job_time'] + OPENSTACK_DELAY: - log.info( - "stale-openstack: destroying volume {volume}({id})" - " because it was created {created} seconds ago" - " which is older than" - " max_job_time {max_job_time} + {delay}" - .format(volume=volume_name, - id=volume_id, - created=created, - max_job_time=config['max_job_time'], - delay=OPENSTACK_DELAY)) - if not ctx.dry_run: - openstack_delete_volume(volume_id) - continue - log.debug("stale-openstack: volume " + volume_id + " OK") - - -def stale_openstack_nodes(ctx, instances, locked_nodes): - names = set([ i['Name'] for i in instances.values() ]) - for (name, node) in locked_nodes.iteritems(): - name = decanonicalize_hostname(name) - if node['machine_type'] != 'openstack': - continue - if (name not in names and - locked_since_seconds(node) > OPENSTACK_DELAY): - log.info("stale-openstack: unlocking node {name} unlocked" - " because it was created {created}" - " seconds ago which is older than {delay}" - " and it has no instance" - .format(name=name, - created=locked_since_seconds(node), - delay=OPENSTACK_DELAY)) - if not ctx.dry_run: - unlock_one(ctx, name, node['locked_by']) - continue - log.debug("stale-openstack: node " + name + " OK") - - -def openstack_remove_again(): - """ - Volumes and servers with REMOVE-ME in the name are leftover - that failed to be removed. It is not uncommon for a failed removal - to succeed later on. 
- """ - sh(""" - openstack server list --name REMOVE-ME --column ID --format value | - xargs --no-run-if-empty --max-args 1 -P20 openstack server delete --wait - true - """) - sh(""" - openstack volume list --name REMOVE-ME --column ID --format value | - xargs --no-run-if-empty --max-args 1 -P20 openstack volume delete - true - """) - - -def main(args): - ctx = FakeNamespace(args) - if ctx.verbose: - teuthology.log.setLevel(logging.DEBUG) - - info = {} - if ctx.archive: - ctx.config = config_file(ctx.archive + '/config.yaml') - ifn = os.path.join(ctx.archive, 'info.yaml') - if os.path.exists(ifn): - with file(ifn, 'r') as fd: - info = yaml.load(fd.read()) - if not ctx.pid: - ctx.pid = info.get('pid') - if not ctx.pid: - ctx.pid = int(open(ctx.archive + '/pid').read().rstrip('\n')) - if not ctx.owner: - ctx.owner = info.get('owner') - if not ctx.owner: - ctx.owner = open(ctx.archive + '/owner').read().rstrip('\n') - - if ctx.targets: - ctx.config = merge_configs(ctx.targets) - - if ctx.stale: - stale_nodes = find_stale_locks(ctx.owner) - targets = dict() - for node in stale_nodes: - targets[node['name']] = node['ssh_pub_key'] - ctx.config = dict(targets=targets) - - if ctx.stale_openstack: - stale_openstack(ctx) - return - - log.info( - '\n '.join( - ['targets:', ] + yaml.safe_dump( - ctx.config['targets'], - default_flow_style=False).splitlines())) - - if ctx.dry_run: - log.info("Not actually nuking anything since --dry-run was passed") - return - - if ctx.owner is None: - ctx.owner = get_user() - - if ctx.pid: - if ctx.archive: - log.info('Killing teuthology process at pid %d', ctx.pid) - os.system('grep -q %s /proc/%d/cmdline && sudo kill %d' % ( - ctx.archive, - ctx.pid, - ctx.pid)) - else: - subprocess.check_call(["kill", "-9", str(ctx.pid)]) - - nuke(ctx, ctx.unlock, ctx.synch_clocks, ctx.reboot_all, ctx.noipmi) - - -def nuke(ctx, should_unlock, sync_clocks=True, reboot_all=True, noipmi=False): - if 'targets' not in ctx.config: - return - total_unnuked = {} - targets = dict(ctx.config['targets']) - if ctx.name: - log.info('Checking targets against current locks') - locks = list_locks() - # Remove targets who's description doesn't match archive name. 
- for lock in locks: - for target in targets: - if target == lock['name']: - if ctx.name not in lock['description']: - del ctx.config['targets'][lock['name']] - log.info( - "Not nuking %s because description doesn't match", - lock['name']) - with parallel() as p: - for target, hostkey in ctx.config['targets'].iteritems(): - p.spawn( - nuke_one, - ctx, - {target: hostkey}, - should_unlock, - sync_clocks, - reboot_all, - ctx.config.get('check-locks', True), - noipmi, - ) - for unnuked in p: - if unnuked: - total_unnuked.update(unnuked) - if total_unnuked: - log.error('Could not nuke the following targets:\n' + - '\n '.join(['targets:', ] + - yaml.safe_dump( - total_unnuked, - default_flow_style=False).splitlines())) - - -def nuke_one(ctx, target, should_unlock, synch_clocks, reboot_all, - check_locks, noipmi): - ret = None - ctx = argparse.Namespace( - config=dict(targets=target), - owner=ctx.owner, - check_locks=check_locks, - synch_clocks=synch_clocks, - reboot_all=reboot_all, - teuthology_config=config.to_dict(), - name=ctx.name, - noipmi=noipmi, - ) - try: - nuke_helper(ctx, should_unlock) - except Exception: - log.exception('Could not nuke %s' % target) - # not re-raising the so that parallel calls aren't killed - ret = target - else: - if should_unlock: - unlock_one(ctx, target.keys()[0], ctx.owner) - return ret - - -def nuke_helper(ctx, should_unlock): - # ensure node is up with ipmi - (target,) = ctx.config['targets'].keys() - host = target.split('@')[-1] - shortname = host.split('.')[0] - if should_unlock: - if 'vpm' in shortname: - return - status_info = get_status(host) - if status_info['is_vm'] and status_info['machine_type'] == 'openstack': - return - log.debug('shortname: %s' % shortname) - log.debug('{ctx}'.format(ctx=ctx)) - if (not ctx.noipmi and 'ipmi_user' in config and - 'vpm' not in shortname): - check_console(host) - - if ctx.check_locks: - # does not check to ensure if the node is 'up' - # we want to be able to nuke a downed node - check_lock(ctx, None, check_up=False) - add_remotes(ctx, None) - connect(ctx, None) - - log.info("Clearing teuthology firewall rules...") - clear_firewall(ctx) - log.info("Cleared teuthology firewall rules.") - - log.info('Unmount ceph-fuse and killing daemons...') - shutdown_daemons(ctx) - log.info('All daemons killed.') - # Try to remove packages before reboot - remove_installed_packages(ctx) - - remotes = ctx.cluster.remotes.keys() - reboot(ctx, remotes) - #shutdown daemons again incase of startup - log.info('Stop daemons after restart...') - shutdown_daemons(ctx) - log.info('All daemons killed.') - log.info('Unmount any osd data directories...') - remove_osd_mounts(ctx) - log.info('Unmount any osd tmpfs dirs...') - remove_osd_tmpfs(ctx) - log.info("Terminating Hadoop services...") - kill_hadoop(ctx) - log.info("Remove kernel mounts...") - remove_kernel_mounts(ctx) - - log.info("Force remove ceph packages") - remove_ceph_packages(ctx) - - log.info('Synchronizing clocks...') - synch_clocks(remotes) - - log.info('Making sure firmware.git is not locked...') - ctx.cluster.run(args=['sudo', 'rm', '-f', - '/lib/firmware/updates/.git/index.lock', ]) - - remove_configuration_files(ctx) - log.info('Removing any multipath config/pkgs...') - undo_multipath(ctx) - log.info('Resetting syslog output locations...') - reset_syslog_dir(ctx) - log.info('Clearing filesystem of test data...') - remove_testing_tree(ctx) - log.info('Filesystem cleared.') - remove_yum_timedhosts(ctx) - # Once again remove packages after reboot - remove_installed_packages(ctx) - 
log.info('Installed packages removed.') diff --git a/teuthology/nuke/__init__.py b/teuthology/nuke/__init__.py new file mode 100644 index 0000000000..165ed9aa41 --- /dev/null +++ b/teuthology/nuke/__init__.py @@ -0,0 +1,744 @@ +import argparse +import datetime +import json +import logging +import os +import subprocess +import time +import yaml + +import teuthology + +from ..config import config, FakeNamespace +from ..lock import ( + list_locks, locked_since_seconds, unlock_one, find_stale_locks +) +from ..lockstatus import get_status +from ..misc import ( + canonicalize_hostname, config_file, decanonicalize_hostname, merge_configs, + get_testdir, get_user, reconnect, sh +) +from ..openstack import OpenStack, OpenStackInstance, enforce_json_dictionary +from ..orchestra import run +from ..orchestra.remote import Remote, getRemoteConsole +from ..parallel import parallel +from ..task import install as install_task +from ..task.internal import check_lock, add_remotes, connect + +log = logging.getLogger(__name__) + + +def clear_firewall(ctx): + """ + Remove any iptables rules created by teuthology. These rules are + identified by containing a comment with 'teuthology' in it. Non-teuthology + firewall rules are unaffected. + """ + ctx.cluster.run( + args=[ + "sudo", "sh", "-c", + "iptables-save | grep -v teuthology | iptables-restore" + ], + ) + + +def shutdown_daemons(ctx): + ctx.cluster.run(args=['sudo', 'stop', 'ceph-all', run.Raw('||'), + 'sudo', 'service', 'ceph', 'stop', run.Raw('||'), + 'sudo', 'systemctl', 'stop', 'ceph.target'], + check_status=False, timeout=180) + ctx.cluster.run( + args=[ + 'if', 'grep', '-q', 'ceph-fuse', '/etc/mtab', run.Raw(';'), + 'then', + 'grep', 'ceph-fuse', '/etc/mtab', run.Raw('|'), + 'grep', '-o', " /.* fuse", run.Raw('|'), + 'grep', '-o', "/.* ", run.Raw('|'), + 'xargs', '-n', '1', 'sudo', 'fusermount', '-u', run.Raw(';'), + 'fi', + run.Raw(';'), + 'if', 'grep', '-q', 'rbd-fuse', '/etc/mtab', run.Raw(';'), + 'then', + 'grep', 'rbd-fuse', '/etc/mtab', run.Raw('|'), + 'grep', '-o', " /.* fuse", run.Raw('|'), + 'grep', '-o', "/.* ", run.Raw('|'), + 'xargs', '-n', '1', 'sudo', 'fusermount', '-u', run.Raw(';'), + 'fi', + run.Raw(';'), + 'sudo', + 'killall', + '--quiet', + 'ceph-mon', + 'ceph-osd', + 'ceph-mds', + 'ceph-fuse', + 'ceph-disk', + 'radosgw', + 'ceph_test_rados', + 'rados', + 'rbd-fuse', + 'apache2', + run.Raw('||'), + 'true', # ignore errors from ceph binaries not being found + ], + timeout=120, + ) + + +def kill_hadoop(ctx): + ctx.cluster.run(args=[ + "ps", "-ef", + run.Raw("|"), "grep", "java.*hadoop", + run.Raw("|"), "grep", "-v", "grep", + run.Raw("|"), 'awk', '{print $2}', + run.Raw("|"), 'xargs', 'kill', '-9', + ], check_status=False, timeout=60) + + +def remove_kernel_mounts(ctx): + """ + properly we should be able to just do a forced unmount, + but that doesn't seem to be working, so you should reboot instead + """ + log.info('clearing kernel mount from all nodes') + ctx.cluster.run( + args=[ + 'grep', 'ceph', '/etc/mtab', run.Raw('|'), + 'grep', '-o', "on /.* type", run.Raw('|'), + 'grep', '-o', "/.* ", run.Raw('|'), + 'xargs', '-r', + 'sudo', 'umount', '-f', run.Raw(';'), + ], + check_status=False, + timeout=60 + ) + + +def remove_osd_mounts(ctx): + """ + unmount any osd data mounts (scratch disks) + """ + ctx.cluster.run( + args=[ + 'grep', + '/var/lib/ceph/osd/', + '/etc/mtab', + run.Raw('|'), + 'awk', '{print $2}', run.Raw('|'), + 'xargs', '-r', + 'sudo', 'umount', run.Raw(';'), + 'true' + ], + timeout=120 + ) + + +def 
remove_osd_tmpfs(ctx): + """ + unmount tmpfs mounts + """ + ctx.cluster.run( + args=[ + 'egrep', 'tmpfs\s+/mnt', '/etc/mtab', run.Raw('|'), + 'awk', '{print $2}', run.Raw('|'), + 'xargs', '-r', + 'sudo', 'umount', run.Raw(';'), + 'true' + ], + timeout=120 + ) + + +def reboot(ctx, remotes): + nodes = {} + for remote in remotes: + log.info('rebooting %s', remote.name) + try: + proc = remote.run( + args=[ + 'sync', + run.Raw('&'), + 'sleep', '5', + run.Raw(';'), + 'sudo', 'reboot', + ], + wait=False, + ) + except Exception: + log.exception('ignoring exception during reboot command') + nodes[remote] = proc + # we just ignore these procs because reboot -f doesn't actually + # send anything back to the ssh client! + # for remote, proc in nodes.iteritems(): + # proc.wait() + if remotes: + log.info('waiting for nodes to reboot') + time.sleep(8) # if we try and reconnect too quickly, it succeeds! + reconnect(ctx, 480) # allow 8 minutes for the reboots + + +def reset_syslog_dir(ctx): + nodes = {} + for remote in ctx.cluster.remotes.iterkeys(): + proc = remote.run( + args=[ + 'if', 'test', '-e', '/etc/rsyslog.d/80-cephtest.conf', + run.Raw(';'), + 'then', + 'sudo', 'rm', '-f', '--', '/etc/rsyslog.d/80-cephtest.conf', + run.Raw('&&'), + 'sudo', 'service', 'rsyslog', 'restart', + run.Raw(';'), + 'fi', + run.Raw(';'), + ], + timeout=60, + ) + nodes[remote.name] = proc + + for name, proc in nodes.iteritems(): + log.info('Waiting for %s to restart syslog...', name) + proc.wait() + + +def dpkg_configure(ctx): + for remote in ctx.cluster.remotes.iterkeys(): + if remote.os.package_type != 'deb': + continue + log.info( + 'Waiting for dpkg --configure -a and apt-get -f install...') + remote.run( + args=[ + 'sudo', 'dpkg', '--configure', '-a', + run.Raw(';'), + 'sudo', 'DEBIAN_FRONTEND=noninteractive', + 'apt-get', '-y', '--force-yes', '-f', 'install', + run.Raw('||'), + ':', + ], + timeout=180, + check_status=False, + ) + + +def remove_yum_timedhosts(ctx): + # Workaround for https://bugzilla.redhat.com/show_bug.cgi?id=1233329 + log.info("Removing yum timedhosts files...") + for remote in ctx.cluster.remotes.iterkeys(): + if remote.os.package_type != 'rpm': + continue + remote.run( + args="sudo find /var/cache/yum -name 'timedhosts' -exec rm {} \;", + check_status=False, timeout=180 + ) + + +def remove_ceph_packages(ctx): + """ + remove ceph and ceph dependent packages by force + force is needed since the node's repo might have changed and + in many cases autocorrect will not work due to missing packages + due to repo changes + """ + ceph_packages_to_remove = ['ceph-common', 'ceph-mon', 'ceph-osd', + 'libcephfs1', 'librados2', 'librgw2', 'librbd1', + 'ceph-selinux', 'python-cephfs', 'ceph-base', + 'python-rbd', 'python-rados', 'ceph-mds', + 'libcephfs-java', 'libcephfs-jni', + 'ceph-deploy', 'libapache2-mod-fastcgi' + ] + pkgs = str.join(' ', ceph_packages_to_remove) + for remote in ctx.cluster.remotes.iterkeys(): + if remote.os.package_type == 'rpm': + log.info("Remove any broken repos") + remote.run( + args=['sudo', 'rm', run.Raw("/etc/yum.repos.d/*ceph*")], + check_status=False + ) + remote.run( + args=['sudo', 'rm', run.Raw("/etc/yum.repos.d/*fcgi*")], + check_status=False, + ) + remote.run( + args=['sudo', 'rpm', '--rebuilddb', run.Raw('&&'), 'yum', + 'clean', 'all'] + ) + log.info('Remove any ceph packages') + remote.run( + args=['sudo', 'yum', 'remove', '-y', run.Raw(pkgs)], + check_status=False + ) + else: + log.info("Remove any broken repos") + remote.run( + args=['sudo', 'rm', 
run.Raw("/etc/apt/sources.list.d/*ceph*")], + check_status=False, + ) + log.info("Autoclean") + remote.run( + args=['sudo', 'apt-get', 'autoclean'], + check_status=False, + ) + log.info('Remove any ceph packages') + remote.run( + args=[ + 'sudo', 'dpkg', '--remove', '--force-remove-reinstreq', + run.Raw(pkgs) + ], + check_status=False + ) + log.info("Autoclean") + remote.run( + args=['sudo', 'apt-get', 'autoclean'] + ) + + +def remove_installed_packages(ctx): + dpkg_configure(ctx) + conf = dict( + project='ceph', + debuginfo='true', + ) + packages = install_task.get_package_list(ctx, conf) + debs = packages['deb'] + \ + ['salt-common', 'salt-minion', 'calamari-server', + 'python-rados', 'multipath-tools'] + rpms = packages['rpm'] + \ + ['salt-common', 'salt-minion', 'calamari-server', + 'multipath-tools', 'device-mapper-multipath'] + install_task.remove_packages( + ctx, + conf, + dict( + deb=debs, + rpm=rpms, + ) + ) + install_task.remove_sources(ctx, conf) + install_task.purge_data(ctx) + + +def remove_testing_tree(ctx): + ctx.cluster.run( + args=[ + 'sudo', 'rm', '-rf', get_testdir(ctx), + # just for old time's sake + run.Raw('&&'), + 'sudo', 'rm', '-rf', '/tmp/cephtest', + run.Raw('&&'), + 'sudo', 'rm', '-rf', '/home/ubuntu/cephtest', + run.Raw('&&'), + 'sudo', 'rm', '-rf', '/etc/ceph', + ], + ) + + +def remove_configuration_files(ctx): + """ + Goes through a list of commonly used configuration files used for testing + that should not be left behind. + + For example, sometimes ceph-deploy may be configured via + ``~/.cephdeploy.conf`` to alter how it handles installation by specifying + a default section in its config with custom locations. + """ + ctx.cluster.run( + args=[ + 'rm', '-f', '/home/ubuntu/.cephdeploy.conf' + ], + timeout=30 + ) + + +def undo_multipath(ctx): + """ + Undo any multipath device mappings created, an + remove the packages/daemon that manages them so they don't + come back unless specifically requested by the test. 
+ """ + for remote in ctx.cluster.remotes.iterkeys(): + remote.run( + args=[ + 'sudo', 'multipath', '-F', + ], + check_status=False, + timeout=60 + ) + + +def synch_clocks(remotes): + for remote in remotes: + remote.run( + args=[ + 'sudo', 'service', 'ntp', 'stop', + run.Raw('&&'), + 'sudo', 'ntpdate-debian', + run.Raw('&&'), + 'sudo', 'hwclock', '--systohc', '--utc', + run.Raw('&&'), + 'sudo', 'service', 'ntp', 'start', + run.Raw('||'), + 'true', # ignore errors; we may be racing with ntpd startup + ], + timeout=60, + ) + + +def check_console(hostname): + remote = Remote(hostname) + shortname = remote.shortname + console = getRemoteConsole( + name=hostname, + ipmiuser=config['ipmi_user'], + ipmipass=config['ipmi_password'], + ipmidomain=config['ipmi_domain']) + cname = '{host}.{domain}'.format( + host=shortname, + domain=config['ipmi_domain']) + log.info('checking console status of %s' % cname) + if console.check_status(): + log.info('console ready on %s' % cname) + return + if console.check_power('on'): + log.info('attempting to reboot %s' % cname) + console.power_cycle() + else: + log.info('attempting to power on %s' % cname) + console.power_on() + timeout = 100 + log.info('checking console status of %s with timeout %s' % + (cname, timeout)) + if console.check_status(timeout=timeout): + log.info('console ready on %s' % cname) + else: + log.error( + "Failed to get console status for %s, " % cname + ) + + +def stale_openstack(ctx): + targets = dict(map(lambda i: (i['ID'], i), + OpenStack.list_instances())) + nodes = list_locks(keyed_by_name=True, locked=True) + stale_openstack_instances(ctx, targets, nodes) + stale_openstack_nodes(ctx, targets, nodes) + stale_openstack_volumes(ctx, OpenStack.list_volumes()) + if not ctx.dry_run: + openstack_remove_again() + +# +# A delay, in seconds, that is significantly longer than +# any kind of OpenStack server creation / deletion / etc. 
+# +OPENSTACK_DELAY = 30 * 60 + + +def stale_openstack_instances(ctx, instances, locked_nodes): + for (instance_id, instance) in instances.iteritems(): + i = OpenStackInstance(instance_id) + if not i.exists(): + log.debug("stale-openstack: {instance} disappeared, ignored" + .format(instance=instance_id)) + continue + if (i.get_created() > + config['max_job_time'] + OPENSTACK_DELAY): + log.info( + "stale-openstack: destroying instance {instance}" + " because it was created {created} seconds ago" + " which is older than" + " max_job_time {max_job_time} + {delay}" + .format(instance=i['name'], + created=i.get_created(), + max_job_time=config['max_job_time'], + delay=OPENSTACK_DELAY)) + if not ctx.dry_run: + i.destroy() + continue + name = canonicalize_hostname(i['name'], user=None) + if i.get_created() > OPENSTACK_DELAY and name not in locked_nodes: + log.info("stale-openstack: destroying instance {instance}" + " because it was created {created} seconds ago" + " is older than {delay}s and it is not locked" + .format(instance=i['name'], + created=i.get_created(), + delay=OPENSTACK_DELAY)) + if not ctx.dry_run: + i.destroy() + continue + log.debug("stale-openstack: instance " + i['name'] + " OK") + + +def openstack_delete_volume(id): + sh("openstack volume delete " + id + " || true") + + +def stale_openstack_volumes(ctx, volumes): + now = datetime.datetime.now() + for volume in volumes: + volume_id = volume.get('ID') or volume['id'] + try: + volume = json.loads(sh("openstack -q volume show -f json " + + volume_id)) + except subprocess.CalledProcessError: + log.debug("stale-openstack: {id} disappeared, ignored" + .format(id=volume_id)) + continue + volume_name = (volume.get('Display Name') or volume.get('display_name') + or volume['name']) + enforce_json_dictionary(volume) + created_at = datetime.datetime.strptime( + volume['created_at'], '%Y-%m-%dT%H:%M:%S.%f') + created = (now - created_at).total_seconds() + if created > config['max_job_time'] + OPENSTACK_DELAY: + log.info( + "stale-openstack: destroying volume {volume}({id})" + " because it was created {created} seconds ago" + " which is older than" + " max_job_time {max_job_time} + {delay}" + .format(volume=volume_name, + id=volume_id, + created=created, + max_job_time=config['max_job_time'], + delay=OPENSTACK_DELAY)) + if not ctx.dry_run: + openstack_delete_volume(volume_id) + continue + log.debug("stale-openstack: volume " + volume_id + " OK") + + +def stale_openstack_nodes(ctx, instances, locked_nodes): + names = set([ i['Name'] for i in instances.values() ]) + for (name, node) in locked_nodes.iteritems(): + name = decanonicalize_hostname(name) + if node['machine_type'] != 'openstack': + continue + if (name not in names and + locked_since_seconds(node) > OPENSTACK_DELAY): + log.info("stale-openstack: unlocking node {name} unlocked" + " because it was created {created}" + " seconds ago which is older than {delay}" + " and it has no instance" + .format(name=name, + created=locked_since_seconds(node), + delay=OPENSTACK_DELAY)) + if not ctx.dry_run: + unlock_one(ctx, name, node['locked_by']) + continue + log.debug("stale-openstack: node " + name + " OK") + + +def openstack_remove_again(): + """ + Volumes and servers with REMOVE-ME in the name are leftover + that failed to be removed. It is not uncommon for a failed removal + to succeed later on. 
+ """ + sh(""" + openstack server list --name REMOVE-ME --column ID --format value | + xargs --no-run-if-empty --max-args 1 -P20 openstack server delete --wait + true + """) + sh(""" + openstack volume list --name REMOVE-ME --column ID --format value | + xargs --no-run-if-empty --max-args 1 -P20 openstack volume delete + true + """) + + +def main(args): + ctx = FakeNamespace(args) + if ctx.verbose: + teuthology.log.setLevel(logging.DEBUG) + + info = {} + if ctx.archive: + ctx.config = config_file(ctx.archive + '/config.yaml') + ifn = os.path.join(ctx.archive, 'info.yaml') + if os.path.exists(ifn): + with file(ifn, 'r') as fd: + info = yaml.load(fd.read()) + if not ctx.pid: + ctx.pid = info.get('pid') + if not ctx.pid: + ctx.pid = int(open(ctx.archive + '/pid').read().rstrip('\n')) + if not ctx.owner: + ctx.owner = info.get('owner') + if not ctx.owner: + ctx.owner = open(ctx.archive + '/owner').read().rstrip('\n') + + if ctx.targets: + ctx.config = merge_configs(ctx.targets) + + if ctx.stale: + stale_nodes = find_stale_locks(ctx.owner) + targets = dict() + for node in stale_nodes: + targets[node['name']] = node['ssh_pub_key'] + ctx.config = dict(targets=targets) + + if ctx.stale_openstack: + stale_openstack(ctx) + return + + log.info( + '\n '.join( + ['targets:', ] + yaml.safe_dump( + ctx.config['targets'], + default_flow_style=False).splitlines())) + + if ctx.dry_run: + log.info("Not actually nuking anything since --dry-run was passed") + return + + if ctx.owner is None: + ctx.owner = get_user() + + if ctx.pid: + if ctx.archive: + log.info('Killing teuthology process at pid %d', ctx.pid) + os.system('grep -q %s /proc/%d/cmdline && sudo kill %d' % ( + ctx.archive, + ctx.pid, + ctx.pid)) + else: + subprocess.check_call(["kill", "-9", str(ctx.pid)]) + + nuke(ctx, ctx.unlock, ctx.synch_clocks, ctx.reboot_all, ctx.noipmi) + + +def nuke(ctx, should_unlock, sync_clocks=True, reboot_all=True, noipmi=False): + if 'targets' not in ctx.config: + return + total_unnuked = {} + targets = dict(ctx.config['targets']) + if ctx.name: + log.info('Checking targets against current locks') + locks = list_locks() + # Remove targets who's description doesn't match archive name. 
+ for lock in locks: + for target in targets: + if target == lock['name']: + if ctx.name not in lock['description']: + del ctx.config['targets'][lock['name']] + log.info( + "Not nuking %s because description doesn't match", + lock['name']) + with parallel() as p: + for target, hostkey in ctx.config['targets'].iteritems(): + p.spawn( + nuke_one, + ctx, + {target: hostkey}, + should_unlock, + sync_clocks, + reboot_all, + ctx.config.get('check-locks', True), + noipmi, + ) + for unnuked in p: + if unnuked: + total_unnuked.update(unnuked) + if total_unnuked: + log.error('Could not nuke the following targets:\n' + + '\n '.join(['targets:', ] + + yaml.safe_dump( + total_unnuked, + default_flow_style=False).splitlines())) + + +def nuke_one(ctx, target, should_unlock, synch_clocks, reboot_all, + check_locks, noipmi): + ret = None + ctx = argparse.Namespace( + config=dict(targets=target), + owner=ctx.owner, + check_locks=check_locks, + synch_clocks=synch_clocks, + reboot_all=reboot_all, + teuthology_config=config.to_dict(), + name=ctx.name, + noipmi=noipmi, + ) + try: + nuke_helper(ctx, should_unlock) + except Exception: + log.exception('Could not nuke %s' % target) + # not re-raising the so that parallel calls aren't killed + ret = target + else: + if should_unlock: + unlock_one(ctx, target.keys()[0], ctx.owner) + return ret + + +def nuke_helper(ctx, should_unlock): + # ensure node is up with ipmi + (target,) = ctx.config['targets'].keys() + host = target.split('@')[-1] + shortname = host.split('.')[0] + if should_unlock: + if 'vpm' in shortname: + return + status_info = get_status(host) + if status_info['is_vm'] and status_info['machine_type'] == 'openstack': + return + log.debug('shortname: %s' % shortname) + log.debug('{ctx}'.format(ctx=ctx)) + if (not ctx.noipmi and 'ipmi_user' in config and + 'vpm' not in shortname): + check_console(host) + + if ctx.check_locks: + # does not check to ensure if the node is 'up' + # we want to be able to nuke a downed node + check_lock(ctx, None, check_up=False) + add_remotes(ctx, None) + connect(ctx, None) + + log.info("Clearing teuthology firewall rules...") + clear_firewall(ctx) + log.info("Cleared teuthology firewall rules.") + + log.info('Unmount ceph-fuse and killing daemons...') + shutdown_daemons(ctx) + log.info('All daemons killed.') + # Try to remove packages before reboot + remove_installed_packages(ctx) + + remotes = ctx.cluster.remotes.keys() + reboot(ctx, remotes) + # shutdown daemons again incase of startup + log.info('Stop daemons after restart...') + shutdown_daemons(ctx) + log.info('All daemons killed.') + log.info('Unmount any osd data directories...') + remove_osd_mounts(ctx) + log.info('Unmount any osd tmpfs dirs...') + remove_osd_tmpfs(ctx) + log.info("Terminating Hadoop services...") + kill_hadoop(ctx) + log.info("Remove kernel mounts...") + remove_kernel_mounts(ctx) + + log.info("Force remove ceph packages") + remove_ceph_packages(ctx) + + log.info('Synchronizing clocks...') + synch_clocks(remotes) + + log.info('Making sure firmware.git is not locked...') + ctx.cluster.run(args=['sudo', 'rm', '-f', + '/lib/firmware/updates/.git/index.lock', ]) + + remove_configuration_files(ctx) + log.info('Removing any multipath config/pkgs...') + undo_multipath(ctx) + log.info('Resetting syslog output locations...') + reset_syslog_dir(ctx) + log.info('Clearing filesystem of test data...') + remove_testing_tree(ctx) + log.info('Filesystem cleared.') + remove_yum_timedhosts(ctx) + # Once again remove packages after reboot + remove_installed_packages(ctx) 
+    log.info('Installed packages removed.')
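
A minimal illustration of why this move is transparent to callers (the calling code below is hypothetical; the import lines simply mirror the hunks above): a package's __init__.py is what "import teuthology.nuke" resolves to, so the public entry points keep the same import path, and only the relative imports inside the moved module gain one level.

    # Hypothetical caller: unaffected by this commit, because teuthology.nuke
    # now resolves to teuthology/nuke/__init__.py instead of teuthology/nuke.py.
    from teuthology.nuke import nuke, nuke_helper, clear_firewall

    # Inside the moved module, relative imports go up one extra level, e.g.:
    #   from .config import config, FakeNamespace    (old teuthology/nuke.py)
    #   from ..config import config, FakeNamespace   (new teuthology/nuke/__init__.py)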