nuke: cleanup stale OpenStack resources 723/head
author    Loic Dachary <ldachary@redhat.com>
          Fri, 27 Nov 2015 20:41:57 +0000 (21:41 +0100)
committer Loic Dachary <ldachary@redhat.com>
          Tue, 1 Dec 2015 11:25:07 +0000 (12:25 +0100)
Implement the --stale-openstack action to clean up OpenStack resources
that are known to be stale because:

* the instances or volumes were created a long time ago
* the instances or volumes have REMOVE-ME in the name
* the nodes are locked but never got an instance

Only instances and volumes known to belong to the teuthology cluster are
considered, i.e. those with ownedby=IP in the name, where IP is the IP
address of the machine running the teuthology cluster.

Signed-off-by: Loic Dachary <loic@dachary.org>
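
For illustration only (not part of this commit), a minimal sketch of the
ownedby=IP scoping described above, assuming instance records shaped like
the dicts returned by OpenStack.list_instances() (each with a 'Name' key);
the helper name and the example instance names are made up:

    # Hypothetical sketch: keep only the instances whose name marks them
    # as owned by this teuthology cluster (ownedby=<IP> in the name).
    def owned_by_this_cluster(instances, my_ip):
        marker = 'ownedby=' + my_ip
        return dict((i['Name'], i) for i in instances if marker in i['Name'])

    instances = [
        {'Name': 'target1-0-ownedby=10.0.0.1'},   # belongs to this cluster
        {'Name': 'unrelated-server'},             # ignored
    ]
    print(owned_by_this_cluster(instances, '10.0.0.1').keys())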
scripts/nuke.py
teuthology/nuke.py
teuthology/test/test_nuke.py [new file with mode: 0644]

index 53fa51ede237f1f62a3302876123c9356341101f..876312f9b41762f74d38325c46d425ea55fa2be9 100644 (file)
@@ -8,6 +8,7 @@ usage:
   teuthology-nuke [-v] [--owner OWNER] [-n NAME] [-u] [-i] [-r] [-s]
                        [-p PID] [--dry-run] (-t CONFIG... | -a DIR)
   teuthology-nuke [-v] [-u] [-i] [-r] [-s] [--dry-run] --owner OWNER --stale
+  teuthology-nuke [-v] [--dry-run] --stale-openstack
 
 Reset test machines
 
@@ -20,6 +21,8 @@ optional arguments:
                         archive path for a job to kill and nuke
   --stale               attempt to find and nuke 'stale' machines
                         (e.g. locked by jobs that are no longer running)
+  --stale-openstack     nuke 'stale' OpenStack instances and volumes
+                        and unlock OpenStack targets with no instance
   --dry-run             Don't actually nuke anything; just print the list of
                         targets that would be nuked
   --owner OWNER         job owner
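
As a usage note, the new action can be combined with --dry-run to log what
would be cleaned up without deleting anything. A minimal sketch driving the
command from Python (assumes the teuthology-nuke entry point is on PATH):

    # Hypothetical usage sketch: preview the stale-OpenStack cleanup.
    # --dry-run only logs what would be destroyed or unlocked.
    import subprocess

    subprocess.check_call(['teuthology-nuke', '--dry-run', '--stale-openstack'])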
index 8991b1bf4b97639c2f0c9c6f6da2b9d4d7a5c327..5b500baa288b7ee892505e0b9b1d9c682d6d3f6c 100644 (file)
@@ -1,4 +1,6 @@
 import argparse
+import datetime
+import json
 import logging
 import os
 import subprocess
@@ -9,24 +11,28 @@ from StringIO import StringIO
 import teuthology
 from . import orchestra
 import orchestra.remote
+from .openstack import OpenStack, OpenStackInstance
 from .orchestra import run
 from .config import FakeNamespace
 from .lock import list_locks
+from .lock import locked_since_seconds
 from .lock import unlock_one
 from .lock import find_stale_locks
 from .lockstatus import get_status
+from .misc import canonicalize_hostname
 from .misc import config_file
+from .misc import decanonicalize_hostname
 from .misc import merge_configs
 from .misc import get_testdir
 from .misc import get_user
 from .misc import reconnect
+from .misc import sh
 from .parallel import parallel
 from .task import install as install_task
 from .task.internal import check_lock, add_remotes, connect
 
 log = logging.getLogger(__name__)
 
-
 def clear_firewall(ctx):
     """
     Remove any iptables rules created by teuthology.  These rules are
@@ -361,6 +367,115 @@ def synch_clocks(remotes):
         log.info('Waiting for clock to synchronize on %s...', name)
         proc.wait()
 
+def stale_openstack(ctx):
+    targets = dict(map(lambda i: (i['Name'], i),
+                       OpenStack.list_instances()))
+    nodes = list_locks(keyed_by_name=True, locked=True)
+    stale_openstack_instances(ctx, targets, nodes)
+    stale_openstack_nodes(ctx, targets, nodes)
+    stale_openstack_volumes(ctx, OpenStack.list_volumes())
+    if not ctx.dry_run:
+        openstack_remove_again()
+
+#
+# A delay, in seconds, that is significantly longer than
+# any kind of OpenStack server creation / deletion / etc.
+#
+OPENSTACK_DELAY = 30 * 60
+
+def stale_openstack_instances(ctx, instances, locked_nodes):
+    for (name, instance) in instances.iteritems():
+        i = OpenStackInstance(name)
+        if (i.get_created() >
+            ctx.teuthology_config['max_job_time'] + OPENSTACK_DELAY):
+            log.info(
+                "stale-openstack: destroying instance {instance}" 
+                " because it was created {created} seconds ago"
+                " which is older than"
+                " max_job_time {max_job_time} + {delay}"
+                .format(instance=i['name'],
+                        created=i.get_created(),
+                        max_job_time=ctx.teuthology_config['max_job_time'],
+                        delay=OPENSTACK_DELAY))
+            if not ctx.dry_run:
+                i.destroy()
+            continue
+        name = canonicalize_hostname(i['name'], user=None)
+        if i.get_created() > OPENSTACK_DELAY and name not in locked_nodes:
+            log.info("stale-openstack: destroying instance {instance}"
+                     " because it was created {created} seconds ago,"
+                     " which is older than {delay}s, and it is not locked"
+                     .format(instance=i['name'],
+                             created=i.get_created(),
+                             delay=OPENSTACK_DELAY))
+            if not ctx.dry_run:
+                i.destroy()
+            continue
+        log.debug("stale-openstack: instance " + i['name'] + " OK")
+
+def openstack_delete_volume(id):
+    sh("openstack volume delete " + id + " || true")
+
+def stale_openstack_volumes(ctx, volumes):
+    now = datetime.datetime.now()
+    for volume in volumes:
+        volume = json.loads(sh("openstack volume show -f json " +
+                               volume['ID']))
+        volume = dict(map(lambda v: (v['Field'], v['Value']), volume))
+        created_at = datetime.datetime.strptime(
+            volume['created_at'], '%Y-%m-%dT%H:%M:%S.%f')
+        created = (now - created_at).total_seconds()
+        if created > ctx.teuthology_config['max_job_time'] + OPENSTACK_DELAY:
+            log.info(
+                "stale-openstack: destroying volume {volume}({id})"
+                " because it was created {created} seconds ago"
+                " which is older than"
+                " max_job_time {max_job_time} + {delay}"
+                .format(volume=volume['display_name'],
+                        id=volume['id'],
+                        created=created,
+                        max_job_time=ctx.teuthology_config['max_job_time'],
+                        delay=OPENSTACK_DELAY))
+            if not ctx.dry_run:
+                openstack_delete_volume(volume['id'])
+            continue
+        log.debug("stale-openstack: volume " + volume['id'] + " OK")
+
+def stale_openstack_nodes(ctx, instances, locked_nodes):
+    for (name, node) in locked_nodes.iteritems():
+        name = decanonicalize_hostname(name)
+        if node['machine_type'] != 'openstack':
+            continue
+        if (name not in instances and
+            locked_since_seconds(node) > OPENSTACK_DELAY):
+            log.info("stale-openstack: unlocking node {name}"
+                     " because it was locked {locked} seconds ago,"
+                     " which is older than {delay},"
+                     " and it has no instance"
+                     .format(name=name,
+                             locked=locked_since_seconds(node),
+                             delay=OPENSTACK_DELAY))
+            if not ctx.dry_run:
+                unlock_one(ctx, name, node['locked_by'])
+            continue
+        log.debug("stale-openstack: node " + name + " OK")
+
+def openstack_remove_again():
+    """
+    Volumes and servers with REMOVE-ME in the name are leftovers
+    from removals that previously failed. It is not uncommon for a
+    failed removal to succeed when retried later on.
+    """
+    sh("""
+    openstack server list --name REMOVE-ME --column ID --format value |
+    xargs --no-run-if-empty --max-args 1 -P20 openstack server delete --wait
+    true
+    """)
+    sh("""
+    openstack volume list --name REMOVE-ME --column ID --format value |
+    xargs --no-run-if-empty --max-args 1 -P20 openstack volume delete
+    true
+    """)
 
 def main(args):
     ctx = FakeNamespace(args)
@@ -393,6 +508,10 @@ def main(args):
             targets[node['name']] = node['ssh_pub_key']
         ctx.config = dict(targets=targets)
 
+    if ctx.stale_openstack:
+        stale_openstack(ctx)
+        return
+
     log.info(
         '\n  '.join(
             ['targets:', ] + yaml.safe_dump(
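
To summarize the policy implemented above, here is a small, self-contained
sketch (illustration only, not part of the commit) of the two rules that
stale_openstack_instances applies to each instance: destroy anything older
than max_job_time + OPENSTACK_DELAY, and destroy instances older than
OPENSTACK_DELAY alone when they are not locked.

    # Illustrative only: mirrors the decision logic of
    # stale_openstack_instances() for a single instance.
    OPENSTACK_DELAY = 30 * 60  # seconds, same value as in teuthology/nuke.py

    def instance_is_stale(created_seconds, is_locked, max_job_time):
        # Older than the longest a job may run, plus slack: always stale.
        if created_seconds > max_job_time + OPENSTACK_DELAY:
            return True
        # Old enough that a job should have locked it by now, yet unlocked.
        if created_seconds > OPENSTACK_DELAY and not is_locked:
            return True
        return False

    # Examples, assuming a max_job_time of 3 hours for illustration:
    assert instance_is_stale(4 * 3600, True, 3 * 3600)    # too old
    assert instance_is_stale(2 * 3600, False, 3 * 3600)   # old and unlocked
    assert not instance_is_stale(60, False, 3 * 3600)     # just created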
diff --git a/teuthology/test/test_nuke.py b/teuthology/test/test_nuke.py
new file mode 100644 (file)
index 0000000..bef19dc
--- /dev/null
@@ -0,0 +1,204 @@
+import json
+import datetime
+
+from mock import patch, Mock, DEFAULT
+
+from teuthology import nuke
+from teuthology import misc
+from teuthology.config import config
+
+
+class TestNuke(object):
+
+    def test_stale_openstack_volumes(self):
+        ctx = Mock()
+        ctx.teuthology_config = config
+        ctx.dry_run = False
+        now = datetime.datetime.strftime(datetime.datetime.now(),
+                                         "%Y-%m-%dT%H:%M:%S.000000")
+        id = '4bee3af9-febb-40c1-a17e-ff63edb415c5'
+        name = 'target1-0'
+        volume_list = json.loads(
+            '[{'
+            ' "ID": "' + id + '"'
+            '}]'
+        )
+        #
+        # A volume created a second ago is left untouched
+        #
+        volume_show = (
+            '['
+            ' {"Field": "id", "Value": "' + id + '"},'
+            ' {"Field": "created_at", "Value": "' + now + '"},'
+            ' {"Field": "display_name", "Value": "' + name + '"}'
+            ']'
+        )
+        def sh(cmd):
+            if 'volume show' in cmd:
+                return volume_show
+
+        with patch.multiple(
+                nuke,
+                sh=sh,
+                openstack_delete_volume=DEFAULT,
+                ) as m:
+            nuke.stale_openstack_volumes(ctx, volume_list)
+            m['openstack_delete_volume'].assert_not_called()
+
+        #
+        # A volume created long ago is destroyed
+        #
+        ancient = "2000-11-02T15:43:12.000000"
+        volume_show = (
+            '['
+            ' {"Field": "id", "Value": "' + id + '"},'
+            ' {"Field": "created_at", "Value": "' + ancient + '"},'
+            ' {"Field": "display_name", "Value": "' + name + '"}'
+            ']'
+        )
+        def sh(cmd):
+            if 'volume show' in cmd:
+                return volume_show
+
+        with patch.multiple(
+                nuke,
+                sh=sh,
+                openstack_delete_volume=DEFAULT,
+                ) as m:
+            nuke.stale_openstack_volumes(ctx, volume_list)
+            m['openstack_delete_volume'].assert_called_with(id)
+
+    def test_stale_openstack_nodes(self):
+        ctx = Mock()
+        ctx.teuthology_config = config
+        ctx.dry_run = False
+        name = 'target1'
+        now = datetime.datetime.strftime(datetime.datetime.now(),
+                                         "%Y-%m-%d %H:%M:%S.%f")
+        #
+        # A node that is not of type openstack is left untouched
+        #
+        with patch.multiple(
+                nuke,
+                unlock_one=DEFAULT,
+                ) as m:
+            nuke.stale_openstack_nodes(ctx, {
+            }, {
+                name: { 'locked_since': now,
+                        'machine_type': 'mira', },
+            })
+            m['unlock_one'].assert_not_called()
+        #
+        # A node that was just locked and does not have
+        # an instance yet is left untouched
+        #
+        with patch.multiple(
+                nuke,
+                unlock_one=DEFAULT,
+                ) as m:
+            nuke.stale_openstack_nodes(ctx, {
+            }, {
+                name: { 'locked_since': now,
+                        'machine_type': 'openstack', },
+            })
+            m['unlock_one'].assert_not_called()
+        #
+        # A node that has been locked for some time and
+        # has no instance is unlocked.
+        #
+        ancient = "2000-11-02 15:43:12.000000"
+        me = 'loic@dachary.org'
+        with patch.multiple(
+                nuke,
+                unlock_one=DEFAULT,
+                ) as m:
+            nuke.stale_openstack_nodes(ctx, {
+            }, {
+                name: { 'locked_since': ancient,
+                        'locked_by': me,
+                        'machine_type': 'openstack', },
+            })
+            m['unlock_one'].assert_called_with(
+                ctx, name, me)
+        #
+        # A node that has been locked for some time and
+        # has an instance is left untouched
+        #
+        with patch.multiple(
+                nuke,
+                unlock_one=DEFAULT,
+                ) as m:
+            nuke.stale_openstack_nodes(ctx, {
+                name: { 'name': name, },
+            }, {
+                name: { 'locked_since': ancient,
+                        'machine_type': 'openstack', },
+            })
+            m['unlock_one'].assert_not_called()
+
+    def test_stale_openstack_instances(self):
+        ctx = Mock()
+        ctx.teuthology_config = config
+        ctx.dry_run = False
+        name = 'target1'
+        #
+        # An instance created a second ago is left untouched,
+        # even when it is not locked.
+        #
+        with patch.multiple(
+                nuke.OpenStackInstance,
+                get_created=lambda _: 1,
+                __getitem__=lambda _, key: name,
+                destroy=DEFAULT,
+                ) as m:
+            nuke.stale_openstack_instances(ctx, {
+                name: { 'id': 'UUID', },
+            }, {
+            })
+            m['destroy'].assert_not_called()
+        #
+        # An instance created a very long time ago is destroyed
+        #
+        with patch.multiple(
+                nuke.OpenStackInstance,
+                get_created=lambda _: 1000000000,
+                __getitem__=lambda _, key: name,
+                destroy=DEFAULT,
+                ) as m:
+            nuke.stale_openstack_instances(ctx, {
+                name: { 'id': 'UUID', },
+            }, {
+                misc.canonicalize_hostname(name, user=None): {},
+            })
+            m['destroy'].assert_called_with()
+        #
+        # An instance that was created a while ago but was never
+        # locked is destroyed.
+        #
+        with patch.multiple(
+                nuke.OpenStackInstance,
+                get_created=lambda _: nuke.OPENSTACK_DELAY + 1,
+                __getitem__=lambda _, key: name,
+                destroy=DEFAULT,
+                ) as m:
+            nuke.stale_openstack_instances(ctx, {
+                name: { 'id': 'UUID', },
+            }, {
+            })
+            m['destroy'].assert_called_with()
+        #
+        # An instance created within the expected lifetime
+        # of a job and locked is left untouched.
+        #
+        with patch.multiple(
+                nuke.OpenStackInstance,
+                get_created=lambda _: nuke.OPENSTACK_DELAY + 1,
+                __getitem__=lambda _, key: name,
+                destroy=DEFAULT,
+                ) as m:
+            nuke.stale_openstack_instances(ctx, {
+                name: { 'id': 'UUID', },
+            }, {
+                misc.canonicalize_hostname(name, user=None): {},
+            })
+            m['destroy'].assert_not_called()
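
The Field/Value fixtures used in test_stale_openstack_volumes above mirror
the `openstack volume show -f json` output that stale_openstack_volumes
consumes. A standalone sketch of the same normalization and age computation
(the sample values are made up for illustration):

    import datetime
    import json

    # The volume show output, as consumed by stale_openstack_volumes, is a
    # list of {"Field": ..., "Value": ...} pairs; it is flattened into a
    # plain dict before comparing the age against max_job_time + OPENSTACK_DELAY.
    raw = json.loads("""
    [
     {"Field": "id", "Value": "4bee3af9-febb-40c1-a17e-ff63edb415c5"},
     {"Field": "created_at", "Value": "2015-11-27T20:41:57.000000"},
     {"Field": "display_name", "Value": "target1-0"}
    ]
    """)
    volume = dict((pair['Field'], pair['Value']) for pair in raw)
    created_at = datetime.datetime.strptime(volume['created_at'],
                                            '%Y-%m-%dT%H:%M:%S.%f')
    age = (datetime.datetime.now() - created_at).total_seconds()
    print(volume['display_name'], volume['id'], age)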