+++ /dev/null
-import argparse
-import datetime
-import json
-import logging
-import os
-import subprocess
-import time
-import yaml
-
-import teuthology
-from . import orchestra
-import orchestra.remote
-from .openstack import OpenStack, OpenStackInstance, enforce_json_dictionary
-from .orchestra import run
-from .config import config, FakeNamespace
-from .lock import list_locks
-from .lock import locked_since_seconds
-from .lock import unlock_one
-from .lock import find_stale_locks
-from .lockstatus import get_status
-from .misc import canonicalize_hostname
-from .misc import config_file
-from .misc import decanonicalize_hostname
-from .misc import merge_configs
-from .misc import get_testdir
-from .misc import get_user
-from .misc import reconnect
-from .misc import sh
-from .parallel import parallel
-from .task import install as install_task
-from .task.internal import check_lock, add_remotes, connect
-
-log = logging.getLogger(__name__)
-
-
-def clear_firewall(ctx):
- """
- Remove any iptables rules created by teuthology. These rules are
- identified by containing a comment with 'teuthology' in it. Non-teuthology
- firewall rules are unaffected.
- """
- ctx.cluster.run(
- args=[
- "sudo", "sh", "-c",
- "iptables-save | grep -v teuthology | iptables-restore"
- ],
- )
-
-
-def shutdown_daemons(ctx):
- ctx.cluster.run(args=['sudo', 'stop', 'ceph-all', run.Raw('||'),
- 'sudo', 'service', 'ceph', 'stop', run.Raw('||'),
- 'sudo', 'systemctl', 'stop', 'ceph.target'],
- check_status=False, timeout=180)
- ctx.cluster.run(
- args=[
- 'if', 'grep', '-q', 'ceph-fuse', '/etc/mtab', run.Raw(';'),
- 'then',
- 'grep', 'ceph-fuse', '/etc/mtab', run.Raw('|'),
- 'grep', '-o', " /.* fuse", run.Raw('|'),
- 'grep', '-o', "/.* ", run.Raw('|'),
- 'xargs', '-n', '1', 'sudo', 'fusermount', '-u', run.Raw(';'),
- 'fi',
- run.Raw(';'),
- 'if', 'grep', '-q', 'rbd-fuse', '/etc/mtab', run.Raw(';'),
- 'then',
- 'grep', 'rbd-fuse', '/etc/mtab', run.Raw('|'),
- 'grep', '-o', " /.* fuse", run.Raw('|'),
- 'grep', '-o', "/.* ", run.Raw('|'),
- 'xargs', '-n', '1', 'sudo', 'fusermount', '-u', run.Raw(';'),
- 'fi',
- run.Raw(';'),
- 'sudo',
- 'killall',
- '--quiet',
- 'ceph-mon',
- 'ceph-osd',
- 'ceph-mds',
- 'ceph-fuse',
- 'ceph-disk',
- 'radosgw',
- 'ceph_test_rados',
- 'rados',
- 'rbd-fuse',
- 'apache2',
- run.Raw('||'),
- 'true', # ignore errors from ceph binaries not being found
- ],
- timeout=120,
- )
-
-
-def kill_hadoop(ctx):
- ctx.cluster.run(args=[
- "ps", "-ef",
- run.Raw("|"), "grep", "java.*hadoop",
- run.Raw("|"), "grep", "-v", "grep",
- run.Raw("|"), 'awk', '{print $2}',
- run.Raw("|"), 'xargs', 'kill', '-9',
- ], check_status=False, timeout=60)
-
-
-def remove_kernel_mounts(ctx):
- """
- properly we should be able to just do a forced unmount,
- but that doesn't seem to be working, so you should reboot instead
- """
- log.info('clearing kernel mount from all nodes')
- ctx.cluster.run(
- args=[
- 'grep', 'ceph', '/etc/mtab', run.Raw('|'),
- 'grep', '-o', "on /.* type", run.Raw('|'),
- 'grep', '-o', "/.* ", run.Raw('|'),
- 'xargs', '-r',
- 'sudo', 'umount', '-f', run.Raw(';'),
- ],
- check_status=False,
- timeout=60
- )
-
-
-def remove_osd_mounts(ctx):
- """
- unmount any osd data mounts (scratch disks)
- """
- ctx.cluster.run(
- args=[
- 'grep',
- '/var/lib/ceph/osd/',
- '/etc/mtab',
- run.Raw('|'),
- 'awk', '{print $2}', run.Raw('|'),
- 'xargs', '-r',
- 'sudo', 'umount', run.Raw(';'),
- 'true'
- ],
- timeout=120
- )
-
-
-def remove_osd_tmpfs(ctx):
- """
- unmount tmpfs mounts
- """
- ctx.cluster.run(
- args=[
- 'egrep', 'tmpfs\s+/mnt', '/etc/mtab', run.Raw('|'),
- 'awk', '{print $2}', run.Raw('|'),
- 'xargs', '-r',
- 'sudo', 'umount', run.Raw(';'),
- 'true'
- ],
- timeout=120
- )
-
-
-def reboot(ctx, remotes):
- nodes = {}
- for remote in remotes:
- log.info('rebooting %s', remote.name)
- try:
- proc = remote.run(
- args=[
- 'sync',
- run.Raw('&'),
- 'sleep', '5',
- run.Raw(';'),
- 'sudo', 'reboot',
- ],
- wait=False,
- )
- except Exception:
- log.exception('ignoring exception during reboot command')
- nodes[remote] = proc
- # we just ignore these procs because reboot -f doesn't actually
- # send anything back to the ssh client!
- # for remote, proc in nodes.iteritems():
- # proc.wait()
- if remotes:
- log.info('waiting for nodes to reboot')
- time.sleep(8) # if we try and reconnect too quickly, it succeeds!
- reconnect(ctx, 480) # allow 8 minutes for the reboots
-
-
-def reset_syslog_dir(ctx):
- nodes = {}
- for remote in ctx.cluster.remotes.iterkeys():
- proc = remote.run(
- args=[
- 'if', 'test', '-e', '/etc/rsyslog.d/80-cephtest.conf',
- run.Raw(';'),
- 'then',
- 'sudo', 'rm', '-f', '--', '/etc/rsyslog.d/80-cephtest.conf',
- run.Raw('&&'),
- 'sudo', 'service', 'rsyslog', 'restart',
- run.Raw(';'),
- 'fi',
- run.Raw(';'),
- ],
- timeout=60,
- )
- nodes[remote.name] = proc
-
- for name, proc in nodes.iteritems():
- log.info('Waiting for %s to restart syslog...', name)
- proc.wait()
-
-
-def dpkg_configure(ctx):
- for remote in ctx.cluster.remotes.iterkeys():
- if remote.os.package_type != 'deb':
- continue
- log.info(
- 'Waiting for dpkg --configure -a and apt-get -f install...')
- remote.run(
- args=[
- 'sudo', 'dpkg', '--configure', '-a',
- run.Raw(';'),
- 'sudo', 'DEBIAN_FRONTEND=noninteractive',
- 'apt-get', '-y', '--force-yes', '-f', 'install',
- run.Raw('||'),
- ':',
- ],
- timeout=180,
- check_status=False,
- )
-
-
-def remove_yum_timedhosts(ctx):
- # Workaround for https://bugzilla.redhat.com/show_bug.cgi?id=1233329
- log.info("Removing yum timedhosts files...")
- for remote in ctx.cluster.remotes.iterkeys():
- if remote.os.package_type != 'rpm':
- continue
- remote.run(
- args="sudo find /var/cache/yum -name 'timedhosts' -exec rm {} \;",
- check_status=False, timeout=180
- )
-
-
-def remove_ceph_packages(ctx):
- """
- remove ceph and ceph dependent packages by force
- force is needed since the node's repo might have changed and
- in many cases autocorrect will not work due to missing packages
- due to repo changes
- """
- ceph_packages_to_remove = ['ceph-common', 'ceph-mon', 'ceph-osd',
- 'libcephfs1', 'librados2', 'librgw2', 'librbd1',
- 'ceph-selinux', 'python-cephfs', 'ceph-base',
- 'python-rbd', 'python-rados', 'ceph-mds',
- 'libcephfs-java', 'libcephfs-jni',
- 'ceph-deploy', 'libapache2-mod-fastcgi'
- ]
- pkgs = str.join(' ', ceph_packages_to_remove)
- for remote in ctx.cluster.remotes.iterkeys():
- if remote.os.package_type == 'rpm':
- log.info("Remove any broken repos")
- remote.run(
- args=['sudo', 'rm', run.Raw("/etc/yum.repos.d/*ceph*")],
- check_status=False
- )
- remote.run(
- args=['sudo', 'rm', run.Raw("/etc/yum.repos.d/*fcgi*")],
- check_status=False,
- )
- remote.run(
- args=['sudo', 'rpm', '--rebuilddb', run.Raw('&&'), 'yum',
- 'clean', 'all']
- )
- log.info('Remove any ceph packages')
- remote.run(
- args=['sudo', 'yum', 'remove', '-y', run.Raw(pkgs)],
- check_status=False
- )
- else:
- log.info("Remove any broken repos")
- remote.run(
- args=['sudo', 'rm', run.Raw("/etc/apt/sources.list.d/*ceph*")],
- check_status=False,
- )
- log.info("Autoclean")
- remote.run(
- args=['sudo', 'apt-get', 'autoclean'],
- check_status=False,
- )
- log.info('Remove any ceph packages')
- remote.run(
- args=[
- 'sudo', 'dpkg', '--remove', '--force-remove-reinstreq',
- run.Raw(pkgs)
- ],
- check_status=False
- )
- log.info("Autoclean")
- remote.run(
- args=['sudo', 'apt-get', 'autoclean']
- )
-
-
-def remove_installed_packages(ctx):
- dpkg_configure(ctx)
- conf = dict(
- project='ceph',
- debuginfo='true',
- )
- packages = install_task.get_package_list(ctx, conf)
- debs = packages['deb'] + \
- ['salt-common', 'salt-minion', 'calamari-server',
- 'python-rados', 'multipath-tools']
- rpms = packages['rpm'] + \
- ['salt-common', 'salt-minion', 'calamari-server',
- 'multipath-tools', 'device-mapper-multipath']
- install_task.remove_packages(
- ctx,
- conf,
- dict(
- deb=debs,
- rpm=rpms,
- )
- )
- install_task.remove_sources(ctx, conf)
- install_task.purge_data(ctx)
-
-
-def remove_testing_tree(ctx):
- ctx.cluster.run(
- args=[
- 'sudo', 'rm', '-rf', get_testdir(ctx),
- # just for old time's sake
- run.Raw('&&'),
- 'sudo', 'rm', '-rf', '/tmp/cephtest',
- run.Raw('&&'),
- 'sudo', 'rm', '-rf', '/home/ubuntu/cephtest',
- run.Raw('&&'),
- 'sudo', 'rm', '-rf', '/etc/ceph',
- ],
- )
-
-
-def remove_configuration_files(ctx):
- """
- Goes through a list of commonly used configuration files used for testing
- that should not be left behind.
-
- For example, sometimes ceph-deploy may be configured via
- ``~/.cephdeploy.conf`` to alter how it handles installation by specifying
- a default section in its config with custom locations.
- """
- ctx.cluster.run(
- args=[
- 'rm', '-f', '/home/ubuntu/.cephdeploy.conf'
- ],
- timeout=30
- )
-
-
-def undo_multipath(ctx):
- """
- Undo any multipath device mappings created, an
- remove the packages/daemon that manages them so they don't
- come back unless specifically requested by the test.
- """
- for remote in ctx.cluster.remotes.iterkeys():
- remote.run(
- args=[
- 'sudo', 'multipath', '-F',
- ],
- check_status=False,
- timeout=60
- )
-
-
-def synch_clocks(remotes):
- for remote in remotes:
- remote.run(
- args=[
- 'sudo', 'service', 'ntp', 'stop',
- run.Raw('&&'),
- 'sudo', 'ntpdate-debian',
- run.Raw('&&'),
- 'sudo', 'hwclock', '--systohc', '--utc',
- run.Raw('&&'),
- 'sudo', 'service', 'ntp', 'start',
- run.Raw('||'),
- 'true', # ignore errors; we may be racing with ntpd startup
- ],
- timeout=60,
- )
-
-
-def check_console(hostname):
- shortname = orchestra.remote.getShortName(hostname)
- console = orchestra.remote.getRemoteConsole(
- name=hostname,
- ipmiuser=config['ipmi_user'],
- ipmipass=config['ipmi_password'],
- ipmidomain=config['ipmi_domain'])
- cname = '{host}.{domain}'.format(
- host=shortname,
- domain=config['ipmi_domain'])
- log.info('checking console status of %s' % cname)
- if console.check_status():
- log.info('console ready on %s' % cname)
- return
- if console.check_power('on'):
- log.info('attempting to reboot %s' % cname)
- console.power_cycle()
- else:
- log.info('attempting to power on %s' % cname)
- console.power_on()
- timeout = 100
- log.info('checking console status of %s with timeout %s' %
- (cname, timeout))
- if console.check_status(timeout=timeout):
- log.info('console ready on %s' % cname)
- else:
- log.error(
- "Failed to get console status for %s, " % cname
- )
-
-
-def stale_openstack(ctx):
- targets = dict(map(lambda i: (i['ID'], i),
- OpenStack.list_instances()))
- nodes = list_locks(keyed_by_name=True, locked=True)
- stale_openstack_instances(ctx, targets, nodes)
- stale_openstack_nodes(ctx, targets, nodes)
- stale_openstack_volumes(ctx, OpenStack.list_volumes())
- if not ctx.dry_run:
- openstack_remove_again()
-
-#
-# A delay, in seconds, that is significantly longer than
-# any kind of OpenStack server creation / deletion / etc.
-#
-OPENSTACK_DELAY = 30 * 60
-
-
-def stale_openstack_instances(ctx, instances, locked_nodes):
- for (instance_id, instance) in instances.iteritems():
- i = OpenStackInstance(instance_id)
- if not i.exists():
- log.debug("stale-openstack: {instance} disappeared, ignored"
- .format(instance=instance_id))
- continue
- if (i.get_created() >
- config['max_job_time'] + OPENSTACK_DELAY):
- log.info(
- "stale-openstack: destroying instance {instance}"
- " because it was created {created} seconds ago"
- " which is older than"
- " max_job_time {max_job_time} + {delay}"
- .format(instance=i['name'],
- created=i.get_created(),
- max_job_time=config['max_job_time'],
- delay=OPENSTACK_DELAY))
- if not ctx.dry_run:
- i.destroy()
- continue
- name = canonicalize_hostname(i['name'], user=None)
- if i.get_created() > OPENSTACK_DELAY and name not in locked_nodes:
- log.info("stale-openstack: destroying instance {instance}"
- " because it was created {created} seconds ago"
- " is older than {delay}s and it is not locked"
- .format(instance=i['name'],
- created=i.get_created(),
- delay=OPENSTACK_DELAY))
- if not ctx.dry_run:
- i.destroy()
- continue
- log.debug("stale-openstack: instance " + i['name'] + " OK")
-
-
-def openstack_delete_volume(id):
- sh("openstack volume delete " + id + " || true")
-
-
-def stale_openstack_volumes(ctx, volumes):
- now = datetime.datetime.now()
- for volume in volumes:
- volume_id = volume.get('ID') or volume['id']
- try:
- volume = json.loads(sh("openstack -q volume show -f json " +
- volume_id))
- except subprocess.CalledProcessError:
- log.debug("stale-openstack: {id} disappeared, ignored"
- .format(id=volume_id))
- continue
- volume_name = (volume.get('Display Name') or volume.get('display_name')
- or volume['name'])
- enforce_json_dictionary(volume)
- created_at = datetime.datetime.strptime(
- volume['created_at'], '%Y-%m-%dT%H:%M:%S.%f')
- created = (now - created_at).total_seconds()
- if created > config['max_job_time'] + OPENSTACK_DELAY:
- log.info(
- "stale-openstack: destroying volume {volume}({id})"
- " because it was created {created} seconds ago"
- " which is older than"
- " max_job_time {max_job_time} + {delay}"
- .format(volume=volume_name,
- id=volume_id,
- created=created,
- max_job_time=config['max_job_time'],
- delay=OPENSTACK_DELAY))
- if not ctx.dry_run:
- openstack_delete_volume(volume_id)
- continue
- log.debug("stale-openstack: volume " + volume_id + " OK")
-
-
-def stale_openstack_nodes(ctx, instances, locked_nodes):
- names = set([ i['Name'] for i in instances.values() ])
- for (name, node) in locked_nodes.iteritems():
- name = decanonicalize_hostname(name)
- if node['machine_type'] != 'openstack':
- continue
- if (name not in names and
- locked_since_seconds(node) > OPENSTACK_DELAY):
- log.info("stale-openstack: unlocking node {name} unlocked"
- " because it was created {created}"
- " seconds ago which is older than {delay}"
- " and it has no instance"
- .format(name=name,
- created=locked_since_seconds(node),
- delay=OPENSTACK_DELAY))
- if not ctx.dry_run:
- unlock_one(ctx, name, node['locked_by'])
- continue
- log.debug("stale-openstack: node " + name + " OK")
-
-
-def openstack_remove_again():
- """
- Volumes and servers with REMOVE-ME in the name are leftover
- that failed to be removed. It is not uncommon for a failed removal
- to succeed later on.
- """
- sh("""
- openstack server list --name REMOVE-ME --column ID --format value |
- xargs --no-run-if-empty --max-args 1 -P20 openstack server delete --wait
- true
- """)
- sh("""
- openstack volume list --name REMOVE-ME --column ID --format value |
- xargs --no-run-if-empty --max-args 1 -P20 openstack volume delete
- true
- """)
-
-
-def main(args):
- ctx = FakeNamespace(args)
- if ctx.verbose:
- teuthology.log.setLevel(logging.DEBUG)
-
- info = {}
- if ctx.archive:
- ctx.config = config_file(ctx.archive + '/config.yaml')
- ifn = os.path.join(ctx.archive, 'info.yaml')
- if os.path.exists(ifn):
- with file(ifn, 'r') as fd:
- info = yaml.load(fd.read())
- if not ctx.pid:
- ctx.pid = info.get('pid')
- if not ctx.pid:
- ctx.pid = int(open(ctx.archive + '/pid').read().rstrip('\n'))
- if not ctx.owner:
- ctx.owner = info.get('owner')
- if not ctx.owner:
- ctx.owner = open(ctx.archive + '/owner').read().rstrip('\n')
-
- if ctx.targets:
- ctx.config = merge_configs(ctx.targets)
-
- if ctx.stale:
- stale_nodes = find_stale_locks(ctx.owner)
- targets = dict()
- for node in stale_nodes:
- targets[node['name']] = node['ssh_pub_key']
- ctx.config = dict(targets=targets)
-
- if ctx.stale_openstack:
- stale_openstack(ctx)
- return
-
- log.info(
- '\n '.join(
- ['targets:', ] + yaml.safe_dump(
- ctx.config['targets'],
- default_flow_style=False).splitlines()))
-
- if ctx.dry_run:
- log.info("Not actually nuking anything since --dry-run was passed")
- return
-
- if ctx.owner is None:
- ctx.owner = get_user()
-
- if ctx.pid:
- if ctx.archive:
- log.info('Killing teuthology process at pid %d', ctx.pid)
- os.system('grep -q %s /proc/%d/cmdline && sudo kill %d' % (
- ctx.archive,
- ctx.pid,
- ctx.pid))
- else:
- subprocess.check_call(["kill", "-9", str(ctx.pid)])
-
- nuke(ctx, ctx.unlock, ctx.synch_clocks, ctx.reboot_all, ctx.noipmi)
-
-
-def nuke(ctx, should_unlock, sync_clocks=True, reboot_all=True, noipmi=False):
- if 'targets' not in ctx.config:
- return
- total_unnuked = {}
- targets = dict(ctx.config['targets'])
- if ctx.name:
- log.info('Checking targets against current locks')
- locks = list_locks()
- # Remove targets who's description doesn't match archive name.
- for lock in locks:
- for target in targets:
- if target == lock['name']:
- if ctx.name not in lock['description']:
- del ctx.config['targets'][lock['name']]
- log.info(
- "Not nuking %s because description doesn't match",
- lock['name'])
- with parallel() as p:
- for target, hostkey in ctx.config['targets'].iteritems():
- p.spawn(
- nuke_one,
- ctx,
- {target: hostkey},
- should_unlock,
- sync_clocks,
- reboot_all,
- ctx.config.get('check-locks', True),
- noipmi,
- )
- for unnuked in p:
- if unnuked:
- total_unnuked.update(unnuked)
- if total_unnuked:
- log.error('Could not nuke the following targets:\n' +
- '\n '.join(['targets:', ] +
- yaml.safe_dump(
- total_unnuked,
- default_flow_style=False).splitlines()))
-
-
-def nuke_one(ctx, target, should_unlock, synch_clocks, reboot_all,
- check_locks, noipmi):
- ret = None
- ctx = argparse.Namespace(
- config=dict(targets=target),
- owner=ctx.owner,
- check_locks=check_locks,
- synch_clocks=synch_clocks,
- reboot_all=reboot_all,
- teuthology_config=config.to_dict(),
- name=ctx.name,
- noipmi=noipmi,
- )
- try:
- nuke_helper(ctx, should_unlock)
- except Exception:
- log.exception('Could not nuke %s' % target)
- # not re-raising the so that parallel calls aren't killed
- ret = target
- else:
- if should_unlock:
- unlock_one(ctx, target.keys()[0], ctx.owner)
- return ret
-
-
-def nuke_helper(ctx, should_unlock):
- # ensure node is up with ipmi
- (target,) = ctx.config['targets'].keys()
- host = target.split('@')[-1]
- shortname = host.split('.')[0]
- if should_unlock:
- if 'vpm' in shortname:
- return
- status_info = get_status(host)
- if status_info['is_vm'] and status_info['machine_type'] == 'openstack':
- return
- log.debug('shortname: %s' % shortname)
- log.debug('{ctx}'.format(ctx=ctx))
- if (not ctx.noipmi and 'ipmi_user' in config and
- 'vpm' not in shortname):
- check_console(host)
-
- if ctx.check_locks:
- # does not check to ensure if the node is 'up'
- # we want to be able to nuke a downed node
- check_lock(ctx, None, check_up=False)
- add_remotes(ctx, None)
- connect(ctx, None)
-
- log.info("Clearing teuthology firewall rules...")
- clear_firewall(ctx)
- log.info("Cleared teuthology firewall rules.")
-
- log.info('Unmount ceph-fuse and killing daemons...')
- shutdown_daemons(ctx)
- log.info('All daemons killed.')
- # Try to remove packages before reboot
- remove_installed_packages(ctx)
-
- remotes = ctx.cluster.remotes.keys()
- reboot(ctx, remotes)
- #shutdown daemons again incase of startup
- log.info('Stop daemons after restart...')
- shutdown_daemons(ctx)
- log.info('All daemons killed.')
- log.info('Unmount any osd data directories...')
- remove_osd_mounts(ctx)
- log.info('Unmount any osd tmpfs dirs...')
- remove_osd_tmpfs(ctx)
- log.info("Terminating Hadoop services...")
- kill_hadoop(ctx)
- log.info("Remove kernel mounts...")
- remove_kernel_mounts(ctx)
-
- log.info("Force remove ceph packages")
- remove_ceph_packages(ctx)
-
- log.info('Synchronizing clocks...')
- synch_clocks(remotes)
-
- log.info('Making sure firmware.git is not locked...')
- ctx.cluster.run(args=['sudo', 'rm', '-f',
- '/lib/firmware/updates/.git/index.lock', ])
-
- remove_configuration_files(ctx)
- log.info('Removing any multipath config/pkgs...')
- undo_multipath(ctx)
- log.info('Resetting syslog output locations...')
- reset_syslog_dir(ctx)
- log.info('Clearing filesystem of test data...')
- remove_testing_tree(ctx)
- log.info('Filesystem cleared.')
- remove_yum_timedhosts(ctx)
- # Once again remove packages after reboot
- remove_installed_packages(ctx)
- log.info('Installed packages removed.')
--- /dev/null
+import argparse
+import datetime
+import json
+import logging
+import os
+import subprocess
+import time
+import yaml
+
+import teuthology
+
+from ..config import config, FakeNamespace
+from ..lock import (
+ list_locks, locked_since_seconds, unlock_one, find_stale_locks
+)
+from ..lockstatus import get_status
+from ..misc import (
+ canonicalize_hostname, config_file, decanonicalize_hostname, merge_configs,
+ get_testdir, get_user, reconnect, sh
+)
+from ..openstack import OpenStack, OpenStackInstance, enforce_json_dictionary
+from ..orchestra import run
+from ..orchestra.remote import Remote, getRemoteConsole
+from ..parallel import parallel
+from ..task import install as install_task
+from ..task.internal import check_lock, add_remotes, connect
+
+log = logging.getLogger(__name__)
+
+
+def clear_firewall(ctx):
+ """
+ Remove any iptables rules created by teuthology. These rules are
+ identified by containing a comment with 'teuthology' in it. Non-teuthology
+ firewall rules are unaffected.
+ """
+ ctx.cluster.run(
+ args=[
+ "sudo", "sh", "-c",
+ "iptables-save | grep -v teuthology | iptables-restore"
+ ],
+ )
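+
+ # For reference, the teuthology-created rules cleared here carry an
+ # iptables comment match. They are added elsewhere with something along
+ # these lines (illustrative sketch only, not part of this module):
+ #
+ #   remote.run(args=['sudo', 'iptables', '-A', 'INPUT', '-j', 'DROP',
+ #                    '-m', 'comment', '--comment', 'teuthology'])
+ #
+ # which is why filtering 'teuthology' out of iptables-save removes only
+ # test-created rules.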
+
+
+def shutdown_daemons(ctx):
+ ctx.cluster.run(args=['sudo', 'stop', 'ceph-all', run.Raw('||'),
+ 'sudo', 'service', 'ceph', 'stop', run.Raw('||'),
+ 'sudo', 'systemctl', 'stop', 'ceph.target'],
+ check_status=False, timeout=180)
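+ # Unmount any ceph-fuse/rbd-fuse mounts recorded in /etc/mtab: the nested
+ # 'grep -o' calls below strip each matching line down to its mount point,
+ # which is then handed to 'fusermount -u' one path at a time.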
+ ctx.cluster.run(
+ args=[
+ 'if', 'grep', '-q', 'ceph-fuse', '/etc/mtab', run.Raw(';'),
+ 'then',
+ 'grep', 'ceph-fuse', '/etc/mtab', run.Raw('|'),
+ 'grep', '-o', " /.* fuse", run.Raw('|'),
+ 'grep', '-o', "/.* ", run.Raw('|'),
+ 'xargs', '-n', '1', 'sudo', 'fusermount', '-u', run.Raw(';'),
+ 'fi',
+ run.Raw(';'),
+ 'if', 'grep', '-q', 'rbd-fuse', '/etc/mtab', run.Raw(';'),
+ 'then',
+ 'grep', 'rbd-fuse', '/etc/mtab', run.Raw('|'),
+ 'grep', '-o', " /.* fuse", run.Raw('|'),
+ 'grep', '-o', "/.* ", run.Raw('|'),
+ 'xargs', '-n', '1', 'sudo', 'fusermount', '-u', run.Raw(';'),
+ 'fi',
+ run.Raw(';'),
+ 'sudo',
+ 'killall',
+ '--quiet',
+ 'ceph-mon',
+ 'ceph-osd',
+ 'ceph-mds',
+ 'ceph-fuse',
+ 'ceph-disk',
+ 'radosgw',
+ 'ceph_test_rados',
+ 'rados',
+ 'rbd-fuse',
+ 'apache2',
+ run.Raw('||'),
+ 'true', # ignore errors from ceph binaries not being found
+ ],
+ timeout=120,
+ )
+
+
+def kill_hadoop(ctx):
+ ctx.cluster.run(args=[
+ "ps", "-ef",
+ run.Raw("|"), "grep", "java.*hadoop",
+ run.Raw("|"), "grep", "-v", "grep",
+ run.Raw("|"), 'awk', '{print $2}',
+ run.Raw("|"), 'xargs', 'kill', '-9',
+ ], check_status=False, timeout=60)
+
+
+def remove_kernel_mounts(ctx):
+ """
+ Ideally a forced unmount would be enough, but that does not seem to
+ work reliably, so the node should be rebooted instead.
+ """
+ log.info('clearing kernel mount from all nodes')
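+ # The grep chain below pulls the mount point out of each ceph entry it
+ # finds and feeds it to a forced 'umount -f'; failures are tolerated since
+ # a reboot is the real fallback (see the docstring above).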
+ ctx.cluster.run(
+ args=[
+ 'grep', 'ceph', '/etc/mtab', run.Raw('|'),
+ 'grep', '-o', "on /.* type", run.Raw('|'),
+ 'grep', '-o', "/.* ", run.Raw('|'),
+ 'xargs', '-r',
+ 'sudo', 'umount', '-f', run.Raw(';'),
+ ],
+ check_status=False,
+ timeout=60
+ )
+
+
+def remove_osd_mounts(ctx):
+ """
+ unmount any osd data mounts (scratch disks)
+ """
+ ctx.cluster.run(
+ args=[
+ 'grep',
+ '/var/lib/ceph/osd/',
+ '/etc/mtab',
+ run.Raw('|'),
+ 'awk', '{print $2}', run.Raw('|'),
+ 'xargs', '-r',
+ 'sudo', 'umount', run.Raw(';'),
+ 'true'
+ ],
+ timeout=120
+ )
+
+
+def remove_osd_tmpfs(ctx):
+ """
+ unmount tmpfs mounts
+ """
+ ctx.cluster.run(
+ args=[
+ 'egrep', r'tmpfs\s+/mnt', '/etc/mtab', run.Raw('|'),
+ 'awk', '{print $2}', run.Raw('|'),
+ 'xargs', '-r',
+ 'sudo', 'umount', run.Raw(';'),
+ 'true'
+ ],
+ timeout=120
+ )
+
+
+def reboot(ctx, remotes):
+ nodes = {}
+ for remote in remotes:
+ log.info('rebooting %s', remote.name)
+ try:
+ proc = remote.run(
+ args=[
+ 'sync',
+ run.Raw('&'),
+ 'sleep', '5',
+ run.Raw(';'),
+ 'sudo', 'reboot',
+ ],
+ wait=False,
+ )
+ except Exception:
+ log.exception('ignoring exception during reboot command')
+ nodes[remote] = proc
+ # we just ignore these procs because reboot -f doesn't actually
+ # send anything back to the ssh client!
+ # for remote, proc in nodes.iteritems():
+ # proc.wait()
+ if remotes:
+ log.info('waiting for nodes to reboot')
+ time.sleep(8) # if we try and reconnect too quickly, it succeeds!
+ reconnect(ctx, 480) # allow 8 minutes for the reboots
+
+
+def reset_syslog_dir(ctx):
+ nodes = {}
+ for remote in ctx.cluster.remotes.iterkeys():
+ proc = remote.run(
+ args=[
+ 'if', 'test', '-e', '/etc/rsyslog.d/80-cephtest.conf',
+ run.Raw(';'),
+ 'then',
+ 'sudo', 'rm', '-f', '--', '/etc/rsyslog.d/80-cephtest.conf',
+ run.Raw('&&'),
+ 'sudo', 'service', 'rsyslog', 'restart',
+ run.Raw(';'),
+ 'fi',
+ run.Raw(';'),
+ ],
+ timeout=60,
+ )
+ nodes[remote.name] = proc
+
+ for name, proc in nodes.iteritems():
+ log.info('Waiting for %s to restart syslog...', name)
+ proc.wait()
+
+
+def dpkg_configure(ctx):
+ for remote in ctx.cluster.remotes.iterkeys():
+ if remote.os.package_type != 'deb':
+ continue
+ log.info(
+ 'Waiting for dpkg --configure -a and apt-get -f install...')
+ remote.run(
+ args=[
+ 'sudo', 'dpkg', '--configure', '-a',
+ run.Raw(';'),
+ 'sudo', 'DEBIAN_FRONTEND=noninteractive',
+ 'apt-get', '-y', '--force-yes', '-f', 'install',
+ run.Raw('||'),
+ ':', # shell no-op; ignore a failed apt-get -f install
+ ],
+ timeout=180,
+ check_status=False,
+ )
+
+
+def remove_yum_timedhosts(ctx):
+ # Workaround for https://bugzilla.redhat.com/show_bug.cgi?id=1233329
+ log.info("Removing yum timedhosts files...")
+ for remote in ctx.cluster.remotes.iterkeys():
+ if remote.os.package_type != 'rpm':
+ continue
+ remote.run(
+ args="sudo find /var/cache/yum -name 'timedhosts' -exec rm {} \;",
+ check_status=False, timeout=180
+ )
+
+
+def remove_ceph_packages(ctx):
+ """
+ Remove Ceph and Ceph-dependent packages by force. Force is needed
+ because the node's repos may have changed and, in many cases, the
+ package manager cannot automatically resolve the removal when packages
+ are missing after those repo changes.
+ """
+ ceph_packages_to_remove = ['ceph-common', 'ceph-mon', 'ceph-osd',
+ 'libcephfs1', 'librados2', 'librgw2', 'librbd1',
+ 'ceph-selinux', 'python-cephfs', 'ceph-base',
+ 'python-rbd', 'python-rados', 'ceph-mds',
+ 'libcephfs-java', 'libcephfs-jni',
+ 'ceph-deploy', 'libapache2-mod-fastcgi'
+ ]
+ pkgs = str.join(' ', ceph_packages_to_remove)
+ for remote in ctx.cluster.remotes.iterkeys():
+ if remote.os.package_type == 'rpm':
+ log.info("Remove any broken repos")
+ remote.run(
+ args=['sudo', 'rm', run.Raw("/etc/yum.repos.d/*ceph*")],
+ check_status=False
+ )
+ remote.run(
+ args=['sudo', 'rm', run.Raw("/etc/yum.repos.d/*fcgi*")],
+ check_status=False,
+ )
+ remote.run(
+ args=['sudo', 'rpm', '--rebuilddb', run.Raw('&&'), 'yum',
+ 'clean', 'all']
+ )
+ log.info('Remove any ceph packages')
+ remote.run(
+ args=['sudo', 'yum', 'remove', '-y', run.Raw(pkgs)],
+ check_status=False
+ )
+ else:
+ log.info("Remove any broken repos")
+ remote.run(
+ args=['sudo', 'rm', run.Raw("/etc/apt/sources.list.d/*ceph*")],
+ check_status=False,
+ )
+ log.info("Autoclean")
+ remote.run(
+ args=['sudo', 'apt-get', 'autoclean'],
+ check_status=False,
+ )
+ log.info('Remove any ceph packages')
+ remote.run(
+ args=[
+ 'sudo', 'dpkg', '--remove', '--force-remove-reinstreq',
+ run.Raw(pkgs)
+ ],
+ check_status=False
+ )
+ log.info("Autoclean")
+ remote.run(
+ args=['sudo', 'apt-get', 'autoclean']
+ )
+
+
+def remove_installed_packages(ctx):
+ dpkg_configure(ctx)
+ conf = dict(
+ project='ceph',
+ debuginfo='true',
+ )
+ packages = install_task.get_package_list(ctx, conf)
+ debs = packages['deb'] + \
+ ['salt-common', 'salt-minion', 'calamari-server',
+ 'python-rados', 'multipath-tools']
+ rpms = packages['rpm'] + \
+ ['salt-common', 'salt-minion', 'calamari-server',
+ 'multipath-tools', 'device-mapper-multipath']
+ install_task.remove_packages(
+ ctx,
+ conf,
+ dict(
+ deb=debs,
+ rpm=rpms,
+ )
+ )
+ install_task.remove_sources(ctx, conf)
+ install_task.purge_data(ctx)
+
+
+def remove_testing_tree(ctx):
+ ctx.cluster.run(
+ args=[
+ 'sudo', 'rm', '-rf', get_testdir(ctx),
+ # just for old time's sake
+ run.Raw('&&'),
+ 'sudo', 'rm', '-rf', '/tmp/cephtest',
+ run.Raw('&&'),
+ 'sudo', 'rm', '-rf', '/home/ubuntu/cephtest',
+ run.Raw('&&'),
+ 'sudo', 'rm', '-rf', '/etc/ceph',
+ ],
+ )
+
+
+def remove_configuration_files(ctx):
+ """
+ Remove commonly used configuration files from testing that should not
+ be left behind on the node.
+
+ For example, sometimes ceph-deploy may be configured via
+ ``~/.cephdeploy.conf`` to alter how it handles installation by specifying
+ a default section in its config with custom locations.
+ """
+ ctx.cluster.run(
+ args=[
+ 'rm', '-f', '/home/ubuntu/.cephdeploy.conf'
+ ],
+ timeout=30
+ )
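+
+ # A minimal sketch of the kind of file removed above (hypothetical
+ # contents; the exact sections depend on how ceph-deploy was configured):
+ #
+ #   # ~/.cephdeploy.conf
+ #   [ceph-deploy-global]
+ #   # custom repo/install overrides that would leak into later test runs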
+
+
+def undo_multipath(ctx):
+ """
+ Undo any multipath device mappings created, and remove the
+ packages/daemons that manage them so they don't come back unless
+ specifically requested by the test.
+ """
+ for remote in ctx.cluster.remotes.iterkeys():
+ remote.run(
+ args=[
+ 'sudo', 'multipath', '-F',
+ ],
+ check_status=False,
+ timeout=60
+ )
+
+
+def synch_clocks(remotes):
+ for remote in remotes:
+ remote.run(
+ args=[
+ 'sudo', 'service', 'ntp', 'stop',
+ run.Raw('&&'),
+ 'sudo', 'ntpdate-debian',
+ run.Raw('&&'),
+ 'sudo', 'hwclock', '--systohc', '--utc',
+ run.Raw('&&'),
+ 'sudo', 'service', 'ntp', 'start',
+ run.Raw('||'),
+ 'true', # ignore errors; we may be racing with ntpd startup
+ ],
+ timeout=60,
+ )
+
+
+def check_console(hostname):
+ remote = Remote(hostname)
+ shortname = remote.shortname
+ console = getRemoteConsole(
+ name=hostname,
+ ipmiuser=config['ipmi_user'],
+ ipmipass=config['ipmi_password'],
+ ipmidomain=config['ipmi_domain'])
+ cname = '{host}.{domain}'.format(
+ host=shortname,
+ domain=config['ipmi_domain'])
+ log.info('checking console status of %s' % cname)
+ if console.check_status():
+ log.info('console ready on %s' % cname)
+ return
+ if console.check_power('on'):
+ log.info('attempting to reboot %s' % cname)
+ console.power_cycle()
+ else:
+ log.info('attempting to power on %s' % cname)
+ console.power_on()
+ timeout = 100
+ log.info('checking console status of %s with timeout %s' %
+ (cname, timeout))
+ if console.check_status(timeout=timeout):
+ log.info('console ready on %s' % cname)
+ else:
+ log.error(
+ "Failed to get console status for %s, " % cname
+ )
+
+
+def stale_openstack(ctx):
+ targets = dict(map(lambda i: (i['ID'], i),
+ OpenStack.list_instances()))
+ nodes = list_locks(keyed_by_name=True, locked=True)
+ stale_openstack_instances(ctx, targets, nodes)
+ stale_openstack_nodes(ctx, targets, nodes)
+ stale_openstack_volumes(ctx, OpenStack.list_volumes())
+ if not ctx.dry_run:
+ openstack_remove_again()
+
+#
+# A delay, in seconds, that is significantly longer than
+# any kind of OpenStack server creation / deletion / etc.
+#
+OPENSTACK_DELAY = 30 * 60
+
+
+def stale_openstack_instances(ctx, instances, locked_nodes):
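+ # An instance is destroyed in two cases: it was created more than
+ # max_job_time + OPENSTACK_DELAY seconds ago (locked or not), or it was
+ # created more than OPENSTACK_DELAY seconds ago and is not locked.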
+ for (instance_id, instance) in instances.iteritems():
+ i = OpenStackInstance(instance_id)
+ if not i.exists():
+ log.debug("stale-openstack: {instance} disappeared, ignored"
+ .format(instance=instance_id))
+ continue
+ if (i.get_created() >
+ config['max_job_time'] + OPENSTACK_DELAY):
+ log.info(
+ "stale-openstack: destroying instance {instance}"
+ " because it was created {created} seconds ago"
+ " which is older than"
+ " max_job_time {max_job_time} + {delay}"
+ .format(instance=i['name'],
+ created=i.get_created(),
+ max_job_time=config['max_job_time'],
+ delay=OPENSTACK_DELAY))
+ if not ctx.dry_run:
+ i.destroy()
+ continue
+ name = canonicalize_hostname(i['name'], user=None)
+ if i.get_created() > OPENSTACK_DELAY and name not in locked_nodes:
+ log.info("stale-openstack: destroying instance {instance}"
+ " because it was created {created} seconds ago"
+ " is older than {delay}s and it is not locked"
+ .format(instance=i['name'],
+ created=i.get_created(),
+ delay=OPENSTACK_DELAY))
+ if not ctx.dry_run:
+ i.destroy()
+ continue
+ log.debug("stale-openstack: instance " + i['name'] + " OK")
+
+
+def openstack_delete_volume(id):
+ sh("openstack volume delete " + id + " || true")
+
+
+def stale_openstack_volumes(ctx, volumes):
+ now = datetime.datetime.now()
+ for volume in volumes:
+ volume_id = volume.get('ID') or volume['id']
+ try:
+ volume = json.loads(sh("openstack -q volume show -f json " +
+ volume_id))
+ except subprocess.CalledProcessError:
+ log.debug("stale-openstack: {id} disappeared, ignored"
+ .format(id=volume_id))
+ continue
+ volume_name = (volume.get('Display Name') or volume.get('display_name')
+ or volume['name'])
+ enforce_json_dictionary(volume)
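+ # OpenStack reports 'created_at' as a naive timestamp such as
+ # 2015-11-04T14:08:51.000000, hence the strptime format below.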
+ created_at = datetime.datetime.strptime(
+ volume['created_at'], '%Y-%m-%dT%H:%M:%S.%f')
+ created = (now - created_at).total_seconds()
+ if created > config['max_job_time'] + OPENSTACK_DELAY:
+ log.info(
+ "stale-openstack: destroying volume {volume}({id})"
+ " because it was created {created} seconds ago"
+ " which is older than"
+ " max_job_time {max_job_time} + {delay}"
+ .format(volume=volume_name,
+ id=volume_id,
+ created=created,
+ max_job_time=config['max_job_time'],
+ delay=OPENSTACK_DELAY))
+ if not ctx.dry_run:
+ openstack_delete_volume(volume_id)
+ continue
+ log.debug("stale-openstack: volume " + volume_id + " OK")
+
+
+def stale_openstack_nodes(ctx, instances, locked_nodes):
+ names = set(i['Name'] for i in instances.values())
+ for (name, node) in locked_nodes.iteritems():
+ name = decanonicalize_hostname(name)
+ if node['machine_type'] != 'openstack':
+ continue
+ if (name not in names and
+ locked_since_seconds(node) > OPENSTACK_DELAY):
+ log.info("stale-openstack: unlocking node {name} unlocked"
+ " because it was created {created}"
+ " seconds ago which is older than {delay}"
+ " and it has no instance"
+ .format(name=name,
+ created=locked_since_seconds(node),
+ delay=OPENSTACK_DELAY))
+ if not ctx.dry_run:
+ unlock_one(ctx, name, node['locked_by'])
+ continue
+ log.debug("stale-openstack: node " + name + " OK")
+
+
+def openstack_remove_again():
+ """
+ Volumes and servers with REMOVE-ME in the name are leftovers from
+ removals that failed. It is not uncommon for a failed removal to
+ succeed when retried later.
+ """
+ sh("""
+ openstack server list --name REMOVE-ME --column ID --format value |
+ xargs --no-run-if-empty --max-args 1 -P20 openstack server delete --wait
+ true
+ """)
+ sh("""
+ openstack volume list --name REMOVE-ME --column ID --format value |
+ xargs --no-run-if-empty --max-args 1 -P20 openstack volume delete
+ true
+ """)
+
+
+def main(args):
+ ctx = FakeNamespace(args)
+ if ctx.verbose:
+ teuthology.log.setLevel(logging.DEBUG)
+
+ info = {}
+ if ctx.archive:
+ ctx.config = config_file(ctx.archive + '/config.yaml')
+ ifn = os.path.join(ctx.archive, 'info.yaml')
+ if os.path.exists(ifn):
+ with file(ifn, 'r') as fd:
+ info = yaml.load(fd.read())
+ if not ctx.pid:
+ ctx.pid = info.get('pid')
+ if not ctx.pid:
+ ctx.pid = int(open(ctx.archive + '/pid').read().rstrip('\n'))
+ if not ctx.owner:
+ ctx.owner = info.get('owner')
+ if not ctx.owner:
+ ctx.owner = open(ctx.archive + '/owner').read().rstrip('\n')
+
+ if ctx.targets:
+ ctx.config = merge_configs(ctx.targets)
+
+ if ctx.stale:
+ stale_nodes = find_stale_locks(ctx.owner)
+ targets = dict()
+ for node in stale_nodes:
+ targets[node['name']] = node['ssh_pub_key']
+ ctx.config = dict(targets=targets)
+
+ if ctx.stale_openstack:
+ stale_openstack(ctx)
+ return
+
+ log.info(
+ '\n '.join(
+ ['targets:', ] + yaml.safe_dump(
+ ctx.config['targets'],
+ default_flow_style=False).splitlines()))
+
+ if ctx.dry_run:
+ log.info("Not actually nuking anything since --dry-run was passed")
+ return
+
+ if ctx.owner is None:
+ ctx.owner = get_user()
+
+ if ctx.pid:
+ if ctx.archive:
+ log.info('Killing teuthology process at pid %d', ctx.pid)
+ os.system('grep -q %s /proc/%d/cmdline && sudo kill %d' % (
+ ctx.archive,
+ ctx.pid,
+ ctx.pid))
+ else:
+ subprocess.check_call(["kill", "-9", str(ctx.pid)])
+
+ nuke(ctx, ctx.unlock, ctx.synch_clocks, ctx.reboot_all, ctx.noipmi)
+
+
+def nuke(ctx, should_unlock, sync_clocks=True, reboot_all=True, noipmi=False):
+ if 'targets' not in ctx.config:
+ return
+ total_unnuked = {}
+ targets = dict(ctx.config['targets'])
+ if ctx.name:
+ log.info('Checking targets against current locks')
+ locks = list_locks()
+ # Remove targets whose description doesn't match the archive name.
+ for lock in locks:
+ for target in targets:
+ if target == lock['name']:
+ if ctx.name not in lock['description']:
+ del ctx.config['targets'][lock['name']]
+ log.info(
+ "Not nuking %s because description doesn't match",
+ lock['name'])
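+ # Fan out one nuke_one() call per remaining target and collect any targets
+ # that could not be nuked so they can be reported below.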
+ with parallel() as p:
+ for target, hostkey in ctx.config['targets'].iteritems():
+ p.spawn(
+ nuke_one,
+ ctx,
+ {target: hostkey},
+ should_unlock,
+ sync_clocks,
+ reboot_all,
+ ctx.config.get('check-locks', True),
+ noipmi,
+ )
+ for unnuked in p:
+ if unnuked:
+ total_unnuked.update(unnuked)
+ if total_unnuked:
+ log.error('Could not nuke the following targets:\n' +
+ '\n '.join(['targets:', ] +
+ yaml.safe_dump(
+ total_unnuked,
+ default_flow_style=False).splitlines()))
+
+
+def nuke_one(ctx, target, should_unlock, synch_clocks, reboot_all,
+ check_locks, noipmi):
+ ret = None
+ ctx = argparse.Namespace(
+ config=dict(targets=target),
+ owner=ctx.owner,
+ check_locks=check_locks,
+ synch_clocks=synch_clocks,
+ reboot_all=reboot_all,
+ teuthology_config=config.to_dict(),
+ name=ctx.name,
+ noipmi=noipmi,
+ )
+ try:
+ nuke_helper(ctx, should_unlock)
+ except Exception:
+ log.exception('Could not nuke %s' % target)
+ # not re-raising so that parallel calls aren't killed
+ ret = target
+ else:
+ if should_unlock:
+ unlock_one(ctx, target.keys()[0], ctx.owner)
+ return ret
+
+
+def nuke_helper(ctx, should_unlock):
+ # ensure node is up with ipmi
+ (target,) = ctx.config['targets'].keys()
+ host = target.split('@')[-1]
+ shortname = host.split('.')[0]
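+ # When the node will be unlocked anyway, VM ('vpm*') and OpenStack targets
+ # are skipped below, since such nodes are typically reprovisioned rather
+ # than cleaned in place.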
+ if should_unlock:
+ if 'vpm' in shortname:
+ return
+ status_info = get_status(host)
+ if status_info['is_vm'] and status_info['machine_type'] == 'openstack':
+ return
+ log.debug('shortname: %s' % shortname)
+ log.debug('{ctx}'.format(ctx=ctx))
+ if (not ctx.noipmi and 'ipmi_user' in config and
+ 'vpm' not in shortname):
+ check_console(host)
+
+ if ctx.check_locks:
+ # does not check whether the node is 'up';
+ # we want to be able to nuke a downed node
+ check_lock(ctx, None, check_up=False)
+ add_remotes(ctx, None)
+ connect(ctx, None)
+
+ log.info("Clearing teuthology firewall rules...")
+ clear_firewall(ctx)
+ log.info("Cleared teuthology firewall rules.")
+
+ log.info('Unmounting ceph-fuse and killing daemons...')
+ shutdown_daemons(ctx)
+ log.info('All daemons killed.')
+ # Try to remove packages before reboot
+ remove_installed_packages(ctx)
+
+ remotes = ctx.cluster.remotes.keys()
+ reboot(ctx, remotes)
+ # shut down daemons again in case they restarted on boot
+ log.info('Stop daemons after restart...')
+ shutdown_daemons(ctx)
+ log.info('All daemons killed.')
+ log.info('Unmount any osd data directories...')
+ remove_osd_mounts(ctx)
+ log.info('Unmount any osd tmpfs dirs...')
+ remove_osd_tmpfs(ctx)
+ log.info("Terminating Hadoop services...")
+ kill_hadoop(ctx)
+ log.info("Remove kernel mounts...")
+ remove_kernel_mounts(ctx)
+
+ log.info("Force remove ceph packages")
+ remove_ceph_packages(ctx)
+
+ log.info('Synchronizing clocks...')
+ synch_clocks(remotes)
+
+ log.info('Making sure firmware.git is not locked...')
+ ctx.cluster.run(args=['sudo', 'rm', '-f',
+ '/lib/firmware/updates/.git/index.lock', ])
+
+ remove_configuration_files(ctx)
+ log.info('Removing any multipath config/pkgs...')
+ undo_multipath(ctx)
+ log.info('Resetting syslog output locations...')
+ reset_syslog_dir(ctx)
+ log.info('Clearing filesystem of test data...')
+ remove_testing_tree(ctx)
+ log.info('Filesystem cleared.')
+ remove_yum_timedhosts(ctx)
+ # Once again remove packages after reboot
+ remove_installed_packages(ctx)
+ log.info('Installed packages removed.')
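+
+
+ # Note: main() is normally reached through the teuthology-nuke command-line
+ # wrapper rather than being imported directly; a typical invocation looks
+ # roughly like (illustrative only, flags vary by teuthology version):
+ #
+ #   teuthology-nuke -t targets.yaml --reboot-all --unlock
+ #
+ # The parsed arguments are handed to main(), which wraps them in a
+ # FakeNamespace as shown above.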