teuthology-nuke [-v] [--owner OWNER] [-n NAME] [-u] [-i] [-r] [-s]
[-p PID] [--dry-run] (-t CONFIG... | -a DIR)
teuthology-nuke [-v] [-u] [-i] [-r] [-s] [--dry-run] --owner OWNER --stale
+ teuthology-nuke [-v] [--dry-run] --stale-openstack
Reset test machines
archive path for a job to kill and nuke
--stale attempt to find and nuke 'stale' machines
(e.g. locked by jobs that are no longer running)
+ --stale-openstack nuke 'stale' OpenStack instances and volumes
+ and unlock OpenStack targets with no instance
--dry-run Don't actually nuke anything; just print the list of
targets that would be nuked
--owner OWNER job owner
import argparse
+import datetime
+import json
import logging
import os
import subprocess
import teuthology
from . import orchestra
import orchestra.remote
+from .openstack import OpenStack, OpenStackInstance
from .orchestra import run
from .config import FakeNamespace
from .lock import list_locks
+from .lock import locked_since_seconds
from .lock import unlock_one
from .lock import find_stale_locks
from .lockstatus import get_status
+from .misc import canonicalize_hostname
from .misc import config_file
+from .misc import decanonicalize_hostname
from .misc import merge_configs
from .misc import get_testdir
from .misc import get_user
from .misc import reconnect
+from .misc import sh
from .parallel import parallel
from .task import install as install_task
from .task.internal import check_lock, add_remotes, connect
log = logging.getLogger(__name__)
-
def clear_firewall(ctx):
"""
Remove any iptables rules created by teuthology. These rules are
log.info('Waiting for clock to synchronize on %s...', name)
proc.wait()
def stale_openstack(ctx):
    """
    Reap stale OpenStack resources: instances that outlived their job,
    volumes nobody deleted, and locked nodes with no backing instance.
    Honors ctx.dry_run (report only, touch nothing).
    """
    instances_by_name = {instance['Name']: instance
                         for instance in OpenStack.list_instances()}
    locked = list_locks(keyed_by_name=True, locked=True)
    stale_openstack_instances(ctx, instances_by_name, locked)
    stale_openstack_nodes(ctx, instances_by_name, locked)
    stale_openstack_volumes(ctx, OpenStack.list_volumes())
    if not ctx.dry_run:
        openstack_remove_again()
+
#
# A delay, in seconds, that is significantly longer than
# any kind of OpenStack server creation / deletion / etc.
# Resources younger than this are assumed to still be in normal
# use and are never reaped.
#
OPENSTACK_DELAY = 30 * 60
+
def stale_openstack_instances(ctx, instances, locked_nodes):
    """
    Destroy stale OpenStack instances.

    An instance is destroyed when either:

    * it is older than max_job_time + OPENSTACK_DELAY: whatever job
      created it should have finished by then, or
    * it is older than OPENSTACK_DELAY and its node is not locked: it
      never became (or no longer is) a test target.

    :param ctx: context; reads teuthology_config['max_job_time'] and dry_run
    :param instances: dict mapping instance name to instance description
    :param locked_nodes: dict of locked nodes keyed by canonical hostname
    """
    # Hoisted: invariant across the loop.
    max_job_time = ctx.teuthology_config['max_job_time']
    # .items() instead of .iteritems(): iteritems() is Python-2-only,
    # items() behaves correctly on both 2 and 3.
    for (name, instance) in instances.items():
        i = OpenStackInstance(name)
        if i.get_created() > max_job_time + OPENSTACK_DELAY:
            log.info(
                "stale-openstack: destroying instance {instance}"
                " because it was created {created} seconds ago"
                " which is older than"
                " max_job_time {max_job_time} + {delay}"
                .format(instance=i['name'],
                        created=i.get_created(),
                        max_job_time=max_job_time,
                        delay=OPENSTACK_DELAY))
            if not ctx.dry_run:
                i.destroy()
            continue
        # Use a distinct variable instead of clobbering the loop
        # variable `name` with the canonical form.
        canonical_name = canonicalize_hostname(i['name'], user=None)
        if (i.get_created() > OPENSTACK_DELAY and
                canonical_name not in locked_nodes):
            log.info("stale-openstack: destroying instance {instance}"
                     " because it was created {created} seconds ago"
                     " is older than {delay}s and it is not locked"
                     .format(instance=i['name'],
                             created=i.get_created(),
                             delay=OPENSTACK_DELAY))
            if not ctx.dry_run:
                i.destroy()
            continue
        log.debug("stale-openstack: instance " + i['name'] + " OK")
+
def openstack_delete_volume(id):
    """
    Best-effort removal of the OpenStack volume *id*: the trailing
    '|| true' makes the shell command succeed even when the delete fails.
    """
    command = "openstack volume delete " + id + " || true"
    sh(command)
+
def stale_openstack_volumes(ctx, volumes):
    """
    Destroy every volume older than max_job_time + OPENSTACK_DELAY:
    whatever job created it should have completed by then.

    :param ctx: context; reads teuthology_config['max_job_time'] and dry_run
    :param volumes: list of volume summaries, each with an 'ID' key
    """
    # NOTE(review): naive local datetime.now() is compared against the
    # volume's created_at, which OpenStack presumably reports in UTC --
    # confirm; a timezone offset would skew the computed age.
    now = datetime.datetime.now()
    for summary in volumes:
        # `volume show` returns a JSON list of {"Field": ..., "Value": ...}
        # pairs; flatten it into a plain field -> value dict.
        details = json.loads(sh("openstack volume show -f json " +
                                summary['ID']))
        volume = {item['Field']: item['Value'] for item in details}
        created_at = datetime.datetime.strptime(
            volume['created_at'], '%Y-%m-%dT%H:%M:%S.%f')
        created = (now - created_at).total_seconds()
        if created > ctx.teuthology_config['max_job_time'] + OPENSTACK_DELAY:
            log.info(
                "stale-openstack: destroying volume {volume}({id})"
                " because it was created {created} seconds ago"
                " which is older than"
                " max_job_time {max_job_time} + {delay}"
                .format(volume=volume['display_name'],
                        id=volume['id'],
                        created=created,
                        max_job_time=ctx.teuthology_config['max_job_time'],
                        delay=OPENSTACK_DELAY))
            if not ctx.dry_run:
                openstack_delete_volume(volume['id'])
            continue
        log.debug("stale-openstack: volume " + volume['id'] + " OK")
+
def stale_openstack_nodes(ctx, instances, locked_nodes):
    """
    Unlock 'openstack' nodes that have been locked for longer than
    OPENSTACK_DELAY but have no backing instance: whatever locked them
    either failed to create the instance or lost it.

    :param ctx: context; reads dry_run
    :param instances: dict of existing instances keyed by (short) name
    :param locked_nodes: dict of locked nodes keyed by canonical hostname
    """
    # .items() instead of .iteritems(): iteritems() is Python-2-only,
    # items() behaves correctly on both 2 and 3.
    for (name, node) in locked_nodes.items():
        name = decanonicalize_hostname(name)
        if node['machine_type'] != 'openstack':
            continue
        if (name not in instances and
                locked_since_seconds(node) > OPENSTACK_DELAY):
            # Message fixed: the node was *locked* (not created) that
            # long ago, and the duplicated "unlocked" was dropped.
            log.info("stale-openstack: unlocking node {name}"
                     " because it was locked {locked} seconds ago"
                     " which is older than {delay}"
                     " and it has no instance"
                     .format(name=name,
                             locked=locked_since_seconds(node),
                             delay=OPENSTACK_DELAY))
            if not ctx.dry_run:
                unlock_one(ctx, name, node['locked_by'])
            continue
        log.debug("stale-openstack: node " + name + " OK")
+
def openstack_remove_again():
    """
    Volumes and servers with REMOVE-ME in the name are leftover
    that failed to be removed. It is not uncommon for a failed removal
    to succeed later on.
    """
    # Delete up to 20 leftover servers in parallel; --wait blocks until
    # each deletion finishes, and the trailing 'true' keeps the overall
    # command exit status at zero even when xargs reports failures.
    sh("""
    openstack server list --name REMOVE-ME --column ID --format value |
    xargs --no-run-if-empty --max-args 1 -P20 openstack server delete --wait
    true
    """)
    # Same best-effort cleanup for leftover volumes (volume delete has
    # no --wait flag).
    sh("""
    openstack volume list --name REMOVE-ME --column ID --format value |
    xargs --no-run-if-empty --max-args 1 -P20 openstack volume delete
    true
    """)
def main(args):
ctx = FakeNamespace(args)
targets[node['name']] = node['ssh_pub_key']
ctx.config = dict(targets=targets)
+ if ctx.stale_openstack:
+ stale_openstack(ctx)
+ return
+
log.info(
'\n '.join(
['targets:', ] + yaml.safe_dump(
--- /dev/null
+import json
+import datetime
+
+from mock import patch, Mock, DEFAULT
+
+from teuthology import nuke
+from teuthology import misc
+from teuthology.config import config
+
+
class TestNuke(object):
    # Unit tests for the stale-openstack reaping helpers in
    # teuthology.nuke.  All OpenStack CLI and lock-server interactions
    # are replaced with mock.patch.multiple, so no network is touched.

    def test_stale_openstack_volumes(self):
        """Volumes are destroyed only once they are old enough."""
        ctx = Mock()
        ctx.teuthology_config = config
        ctx.dry_run = False
        # Timestamp format matches what `openstack volume show` reports.
        now = datetime.datetime.strftime(datetime.datetime.now(),
                                         "%Y-%m-%dT%H:%M:%S.000000")
        id = '4bee3af9-febb-40c1-a17e-ff63edb415c5'
        name = 'target1-0'
        volume_list = json.loads(
            '[{'
            ' "ID": "' + id + '"'
            '}]'
        )
        #
        # A volume created a second ago is left untouched
        #
        volume_show = (
            '['
            ' {"Field": "id", "Value": "' + id + '"},'
            ' {"Field": "created_at", "Value": "' + now + '"},'
            ' {"Field": "display_name", "Value": "' + name + '"}'
            ']'
        )
        # Stand-in for nuke.sh: only answers the `volume show` query.
        def sh(cmd):
            if 'volume show' in cmd:
                return volume_show

        with patch.multiple(
                nuke,
                sh=sh,
                openstack_delete_volume=DEFAULT,
        ) as m:
            nuke.stale_openstack_volumes(ctx, volume_list)
            m['openstack_delete_volume'].assert_not_called()

        #
        # A volume created long ago is destroyed
        #
        ancient = "2000-11-02T15:43:12.000000"
        volume_show = (
            '['
            ' {"Field": "id", "Value": "' + id + '"},'
            ' {"Field": "created_at", "Value": "' + ancient + '"},'
            ' {"Field": "display_name", "Value": "' + name + '"}'
            ']'
        )
        def sh(cmd):
            if 'volume show' in cmd:
                return volume_show

        with patch.multiple(
                nuke,
                sh=sh,
                openstack_delete_volume=DEFAULT,
        ) as m:
            nuke.stale_openstack_volumes(ctx, volume_list)
            m['openstack_delete_volume'].assert_called_with(id)

    def test_stale_openstack_nodes(self):
        """Locked nodes are unlocked only when stale and instance-less."""
        ctx = Mock()
        ctx.teuthology_config = config
        ctx.dry_run = False
        name = 'target1'
        # Format matches the lock server's locked_since timestamps.
        now = datetime.datetime.strftime(datetime.datetime.now(),
                                         "%Y-%m-%d %H:%M:%S.%f")
        #
        # A node is not of type openstack is left untouched
        #
        with patch.multiple(
                nuke,
                unlock_one=DEFAULT,
        ) as m:
            nuke.stale_openstack_nodes(ctx, {
            }, {
                name: { 'locked_since': now,
                        'machine_type': 'mira', },
            })
            m['unlock_one'].assert_not_called()
        #
        # A node that was just locked and does not have
        # an instance yet is left untouched
        #
        with patch.multiple(
                nuke,
                unlock_one=DEFAULT,
        ) as m:
            nuke.stale_openstack_nodes(ctx, {
            }, {
                name: { 'locked_since': now,
                        'machine_type': 'openstack', },
            })
            m['unlock_one'].assert_not_called()
        #
        # A node that has been locked for some time and
        # has no instance is unlocked.
        #
        ancient = "2000-11-02 15:43:12.000000"
        me = 'loic@dachary.org'
        with patch.multiple(
                nuke,
                unlock_one=DEFAULT,
        ) as m:
            nuke.stale_openstack_nodes(ctx, {
            }, {
                name: { 'locked_since': ancient,
                        'locked_by': me,
                        'machine_type': 'openstack', },
            })
            m['unlock_one'].assert_called_with(
                ctx, name, me)
        #
        # A node that has been locked for some time and
        # has an instance is left untouched
        #
        with patch.multiple(
                nuke,
                unlock_one=DEFAULT,
        ) as m:
            nuke.stale_openstack_nodes(ctx, {
                name: { 'name': name, },
            }, {
                name: { 'locked_since': ancient,
                        'machine_type': 'openstack', },
            })
            m['unlock_one'].assert_not_called()

    def test_stale_openstack_instances(self):
        """Instances are destroyed based on age and lock status."""
        ctx = Mock()
        ctx.teuthology_config = config
        ctx.dry_run = False
        name = 'target1'
        #
        # An instance created a second ago is left untouched,
        # even when it is not locked.
        #
        # get_created is patched to return the instance age in seconds;
        # __getitem__ makes i['name'] resolve without a real instance.
        with patch.multiple(
                nuke.OpenStackInstance,
                get_created=lambda _: 1,
                __getitem__=lambda _, key: name,
                destroy=DEFAULT,
        ) as m:
            nuke.stale_openstack_instances(ctx, {
                name: { 'id': 'UUID', },
            }, {
            })
            m['destroy'].assert_not_called()
        #
        # An instance created a very long time ago is destroyed
        #
        with patch.multiple(
                nuke.OpenStackInstance,
                get_created=lambda _: 1000000000,
                __getitem__=lambda _, key: name,
                destroy=DEFAULT,
        ) as m:
            nuke.stale_openstack_instances(ctx, {
                name: { 'id': 'UUID', },
            }, {
                misc.canonicalize_hostname(name, user=None): {},
            })
            m['destroy'].assert_called_with()
        #
        # An instance created but not locked after a while is
        # destroyed.
        #
        with patch.multiple(
                nuke.OpenStackInstance,
                get_created=lambda _: nuke.OPENSTACK_DELAY + 1,
                __getitem__=lambda _, key: name,
                destroy=DEFAULT,
        ) as m:
            nuke.stale_openstack_instances(ctx, {
                name: { 'id': 'UUID', },
            }, {
            })
            m['destroy'].assert_called_with()
        #
        # An instance created within the expected lifetime
        # of a job and locked is left untouched.
        #
        with patch.multiple(
                nuke.OpenStackInstance,
                get_created=lambda _: nuke.OPENSTACK_DELAY + 1,
                __getitem__=lambda _, key: name,
                destroy=DEFAULT,
        ) as m:
            nuke.stale_openstack_instances(ctx, {
                name: { 'id': 'UUID', },
            }, {
                misc.canonicalize_hostname(name, user=None): {},
            })
            m['destroy'].assert_not_called()