git.apps.os.sepia.ceph.com Git - teuthology.git/commitdiff
Support added for running scheduled tasks on virtual machines.
author Warren Usui <warren.usui@inktank.com>
Fri, 7 Jun 2013 01:43:43 +0000 (18:43 -0700)
committer Warren Usui <warren.usui@inktank.com>
Sat, 8 Jun 2013 02:32:15 +0000 (19:32 -0700)
    This included:
    A.) Changes made so that full path names are used for some files
        (scheduled tasks are started in different home directories).
    B.) Changes to ensure tasks come up on the beanstalkc queue properly.
    C.) Finding and inserting the libvirt equivalent code for virtual
        machines in order to simulate IPMI actions.
    D.) Fixes to the host key code, and clearer reporting of valgrind issues.
    E.) Some message and downburst call changes.

    Fix #4988
    Fix #5122
Signed-off-by: Warren Usui <warren.usui@inktank.com>
bootstrap
teuthology/lock.py
teuthology/lockstatus.py
teuthology/misc.py
teuthology/nuke.py
teuthology/orchestra/remote.py
teuthology/task/internal.py
teuthology/task/thrashosds.py

index cf029b6c3cb1cbbd2026e563c3242a929e058753..ebecbad7b68ad1719adac745039ed8d2b8448400 100755 (executable)
--- a/bootstrap
+++ b/bootstrap
@@ -1,7 +1,7 @@
 #!/bin/sh
 set -e
 
-for package in python-dev python-pip python-virtualenv libevent-dev; do
+for package in python-dev python-pip python-virtualenv libevent-dev python-libvirt; do
     if [ "$(dpkg --status -- $package|sed -n 's/^Status: //p')" != "install ok installed" ]; then
        # add a space after old values
        missing="${missing:+$missing }$package"
@@ -13,7 +13,9 @@ if [ -n "$missing" ]; then
     exit 1
 fi
 
-virtualenv --no-site-packages --distribute virtualenv
+# site packages needed because libvirt python bindings are not nicely
+# packaged
+virtualenv --system-site-packages --distribute virtualenv
 
 # avoid pip bugs
 ./virtualenv/bin/pip install --upgrade pip
index 66f73c1edc1fab1e7da3e10ff8e856c15f2c679b..a716bfcbd846c70ac0b75e4a0b22f3377689ab08 100644 (file)
@@ -8,6 +8,7 @@ import re
 import collections
 import tempfile
 import os
+import time
 
 from teuthology import lockstatus as ls
 from teuthology import misc as teuthology
@@ -65,6 +66,13 @@ def list_locks(ctx):
     return None
 
 def update_lock(ctx, name, description=None, status=None, sshpubkey=None):
+    status_info = ls.get_status(ctx, name)
+    phys_host = status_info['vpshost']
+    if phys_host:
+        keyscan_out = ''
+        while not keyscan_out:
+            time.sleep(10)
+            keyscan_out, _ = keyscan_check(ctx, [name])
     updated = {}
     if description is not None:
         updated['desc'] = description
@@ -540,7 +548,7 @@ def create_if_vm(ctx, machine_name):
         if not file_out:
             file_info = {}
             file_info['disk-size'] = lcnfg.get('disk-size', '30G')
-            file_info['ram'] = lcnfg.get('ram', '4G')
+            file_info['ram'] = lcnfg.get('ram', '1.9G')
             file_info['cpus'] = lcnfg.get('cpus', 1)
             file_info['networks'] = lcnfg.get('networks',
                     [{'source' : 'front'}])
@@ -562,7 +570,7 @@ def create_if_vm(ctx, machine_name):
                 stdout=subprocess.PIPE,stderr=subprocess.PIPE,)
         owt,err = p.communicate()
         if err:
-            log.info("Downburst command to create %s failed: %s" %
+            log.info("Downburst completed on %s: %s" %
                     (machine_name,err))
         else:
             log.info("%s created: %s" % (machine_name,owt))
index 9e63614a6e6a641380b09d527205eee246103a38..5c25479efc7669cd5c3897f696fba6d1e47a2567 100644 (file)
@@ -5,8 +5,11 @@ import logging
 log = logging.getLogger(__name__)
 
 def _lock_url(ctx):
-    return ctx.teuthology_config['lock_server']
-
+    try:
+        return ctx.teuthology_config['lock_server']
+    except (AttributeError, KeyError):
+        return "http://teuthology.front.sepia.ceph.com/locker/lock"
 def send_request(method, url, body=None, headers=None):
     http = httplib2.Http()
     resp, content = http.request(url, method=method, body=body, headers=headers)
@@ -21,5 +24,3 @@ def get_status(ctx, name):
     if success:
         return json.loads(content)
     return None
-
-
index ce3e96d071751707905a4ac73d78daf17acb1993..1912c15151b711fd2e3d0ed751fc58cbb2311dda 100644 (file)
@@ -727,10 +727,14 @@ def reconnect(ctx, timeout, remotes=None):
         for remote in need_reconnect:
             try:
                 log.info('trying to connect to %s', remote.name)
+                key = ctx.config['targets'][remote.name]
+                kstat = lockstatus.get_status(ctx,remote.name)
+                if 'sshpubkey' in kstat:
+                    key = kstat['sshpubkey']
                 from .orchestra import connection
                 remote.ssh = connection.connect(
                     user_at_host=remote.name,
-                    host_key=ctx.config['targets'][remote.name],
+                    host_key=key,
                     keep_alive=True,
                     )
             except Exception:
index 8841edd83888eea12880a9586094e46103c6a1fa..5a8efae28c5a323bfac1e70e78f66c9fd891e2e1 100644 (file)
@@ -416,7 +416,7 @@ def nuke_helper(ctx, log):
     log.debug('shortname: %s' % shortname)
     log.debug('{ctx}'.format(ctx=ctx))
     if not ctx.noipmi and 'ipmi_user' in ctx.teuthology_config:
-        console = remote.RemoteConsole(name=host,
+        console = remote.getRemoteConsole(name=host,
                                        ipmiuser=ctx.teuthology_config['ipmi_user'],
                                        ipmipass=ctx.teuthology_config['ipmi_password'],
                                        ipmidomain=ctx.teuthology_config['ipmi_domain'])
index f6aa9672e118a7cce11b4760f8ec6f0fa24490b3..47824d5ee34547059cebc47abbdf5ebde169f03d 100644 (file)
@@ -46,22 +46,26 @@ class Remote(object):
 import pexpect
 import re
 import logging
+import libvirt
+from teuthology import lockstatus as ls
 
 log = logging.getLogger(__name__)
 
-class RemoteConsole(object):
+def getShortName(name):
+    hn = name.split('@')[-1]
+    p = re.compile('([^.]+)\.?.*')
+    return p.match(hn).groups()[0]
+
+class PhysicalConsole():
 
     def __init__(self, name, ipmiuser, ipmipass, ipmidomain, logfile=None, timeout=20):
         self.name = name
+        self.shortname = getShortName(name)
+        self.timeout = timeout
+        self.logfile = None
         self.ipmiuser = ipmiuser
         self.ipmipass = ipmipass
         self.ipmidomain = ipmidomain
-        self.timeout = timeout
-        self.logfile = None
-
-        hn = self.name.split('@')[-1]
-        p = re.compile('([^.]+)\.?.*')
-        self.shortname = p.match(hn).groups()[0]
 
     def _exec(self, cmd):
         if not self.ipmiuser or not self.ipmipass or not self.ipmidomain:
@@ -83,6 +87,34 @@ class RemoteConsole(object):
             child.logfile = self.logfile
         return child
 
+    def _exit_session(self, child, timeout=None):
+        child.send('~.')
+        t = timeout
+        if not t:
+            t = self.timeout
+        r = child.expect(['terminated ipmitool', pexpect.TIMEOUT, pexpect.EOF], timeout=t)
+        if r != 0:
+            self._exec('sol deactivate')
+
+    def _wait_for_login(self, timeout=None, attempts=6):
+        log.debug('Waiting for login prompt on {s}'.format(s=self.shortname))
+        # wait for login prompt to indicate boot completed
+        t = timeout
+        if not t:
+            t = self.timeout
+        for i in range(0, attempts):
+            start = time.time()
+            while time.time() - start < t:
+                child = self._exec('sol activate')
+                child.send('\n')
+                log.debug('expect: {s} login'.format(s=self.shortname))
+                r = child.expect(['{s} login: '.format(s=self.shortname), pexpect.TIMEOUT, pexpect.EOF], timeout=(t - (time.time() - start)))
+                log.debug('expect before: {b}'.format(b=child.before))
+                log.debug('expect after: {a}'.format(a=child.after))
+
+                self._exit_session(child)
+                if r == 0:
+                    return
     def check_power(self, state, timeout=None):
        # check power
        total_timeout = timeout
@@ -118,36 +150,6 @@ class RemoteConsole(object):
             log.info('Failed to get ipmi console status for {s}: {e}'.format(s=self.shortname, e=e))
             return False
 
-    def _exit_session(self, child, timeout=None):
-        child.send('~.')
-        t = timeout
-        if not t:
-            t = self.timeout
-        r = child.expect(['terminated ipmitool', pexpect.TIMEOUT, pexpect.EOF], timeout=t)
-        if r != 0:
-            self._exec('sol deactivate')
-
-    def _wait_for_login(self, timeout=None, attempts=6):
-        log.debug('Waiting for login prompt on {s}'.format(s=self.shortname))
-        # wait for login prompt to indicate boot completed
-        t = timeout
-        if not t:
-            t = self.timeout
-        for i in range(0, attempts):
-            start = time.time()
-            while time.time() - start < t:
-                child = self._exec('sol activate')
-                child.send('\n')
-                log.debug('expect: {s} login'.format(s=self.shortname))
-                r = child.expect(['{s} login: '.format(s=self.shortname), pexpect.TIMEOUT, pexpect.EOF], timeout=(t - (time.time() - start)))
-                log.debug('expect before: {b}'.format(b=child.before))
-                log.debug('expect after: {a}'.format(a=child.after))
-
-                self._exit_session(child)
-                if r == 0:
-                    return
-        raise pexpect.TIMEOUT ('Timeout exceeded ({t}) for {a} attempts in _wait_for_login()'.format(t=t, a=attempts))
-
     def power_cycle(self):
         log.info('Power cycling {s}'.format(s=self.shortname))
         child = self._exec('power cycle')
@@ -166,18 +168,6 @@ class RemoteConsole(object):
         self._wait_for_login()
         log.info('Hard reset for {s} completed'.format(s=self.shortname))
 
-    def power_off(self):
-        log.info('Power off {s}'.format(s=self.shortname))
-        start = time.time()
-        while time.time() - start < self.timeout:
-            child = self._exec('power off')
-            r = child.expect(['Chassis Power Control: Down/Off', pexpect.EOF], timeout=self.timeout)
-            if r == 0:
-                break
-        if not self.check_power('off', 60):
-            log.error('Failed to power off {s}'.format(s=self.shortname))
-        log.info('Power off for {s} completed'.format(s=self.shortname))
-
     def power_on(self):
         log.info('Power on {s}'.format(s=self.shortname))
         start = time.time()
@@ -190,6 +180,18 @@ class RemoteConsole(object):
             log.error('Failed to power on {s}'.format(s=self.shortname))
         log.info('Power on for {s} completed'.format(s=self.shortname))
 
+    def power_off(self):
+        log.info('Power off {s}'.format(s=self.shortname))
+        start = time.time()
+        while time.time() - start < self.timeout:
+            child = self._exec('power off')
+            r = child.expect(['Chassis Power Control: Down/Off', pexpect.EOF], timeout=self.timeout)
+            if r == 0:
+                break
+        if not self.check_power('off', 60):
+            log.error('Failed to power off {s}'.format(s=self.shortname))
+        log.info('Power off for {s} completed'.format(s=self.shortname))
+
     def power_off_for_interval(self, interval=30):
         log.info('Power off {s} for {i} seconds'.format(s=self.shortname, i=interval))
         child = self._exec('power off')
@@ -199,7 +201,54 @@ class RemoteConsole(object):
 
         child = self._exec('power on')
         child.expect('Chassis Power Control: Up/On', timeout=self.timeout)
-
         self._wait_for_login()
+        log.info('Power off for {i} seconds completed'.format(s=self.shortname, i=interval))
 
+class VirtualConsole():
+
+    def __init__(self, name, ipmiuser, ipmipass, ipmidomain, logfile=None, timeout=20):
+        self.shortname = getShortName(name)
+        status_info = ls.get_status('', self.shortname)
+        try:
+            phys_host = status_info['vpshost']
+        except TypeError:
+            return
+        self.connection = libvirt.open(phys_host)
+        for i in self.connection.listDomainsID():
+            d = con.lookupByID(i)
+            if d.name() == self.shortname:
+                self.vm_domain = d
+                break
+        return
+
+    def check_power(self, state, timeout=None):
+        return self.vm_domain.info[0] in [libvirt.VIR_DOMAIN_RUNNING, libvirt.VIR_DOMAIN_BLOCKED,
+                libvirt.VIR_DOMAIN_PAUSED]
+
+    def check_status(self, timeout=None):
+        return self.vm_domain.info()[0]  == libvirt.VIR_DOMAIN_RUNNING 
+
+    def power_cycle(self):
+        self.vm_domain.info().destroy() 
+        self.vm_domain.info().create() 
+
+    def hard_reset(self):
+        self.vm_domain.info().destroy() 
+
+    def power_on(self):
+        self.vm_domain.info().create() 
+
+    def power_off(self):
+        self.vm_domain.info().destroy() 
+
+    def power_off_for_interval(self, interval=30):
+        log.info('Power off {s} for {i} seconds'.format(s=self.shortname, i=interval))
+        self.vm_domain.info().destroy() 
+        time.sleep(interval)
+        self.vm_domain.info().create() 
         log.info('Power off for {i} seconds completed'.format(s=self.shortname, i=interval))
+
+def getRemoteConsole(name, ipmiuser, ipmipass, ipmidomain, logfile=None, timeout=20):
+    if name.startswith('vpm'):
+        return VirtualConsole(name, ipmiuser, ipmipass, ipmidomain, logfile, timeout)
+    return PhysicalConsole(name, ipmiuser, ipmipass, ipmidomain, logfile, timeout)
index 53249685e1ac825b6465e9668f3b2b9180777892..444341eb4e5bf29b3a5738241bf2be6a22222775 100644 (file)
@@ -106,6 +106,7 @@ def lock_machines(ctx, config):
                 while not keyscan_out:
                     time.sleep(10)
                     keyscan_out, current_locks = lock.keyscan_check(ctx, vmlist)
+                    log.info('virtual machine is stil unavailable')
                 if lock.update_keys(ctx, keyscan_out, current_locks):
                     log.info("Error in virtual machine keys")
                 newscandict = {}
@@ -172,8 +173,18 @@ def connect(ctx, config):
     from ..orchestra import connection, remote
     from ..orchestra import cluster
     remotes = []
-    for t, key in ctx.config['targets'].iteritems():
+    machs = []
+    for name in ctx.config['targets'].iterkeys():
+        machs.append(name)
+    lock.scan_for_locks(ctx, machs) 
+    for t, xkey in ctx.config['targets'].iteritems():
         log.debug('connecting to %s', t)
+        log.info('Key is :%s:', xkey) 
+        oldkeystatus = lockstatus.get_status(ctx, t)
+        key = xkey
+        if 'sshpubkey' in oldkeystatus:
+            log.info('possible key is :%s:',oldkeystatus['sshpubkey'])
+            key = oldkeystatus['sshpubkey']
         remotes.append(
             remote.Remote(name=t,
                           ssh=connection.connect(user_at_host=t,
@@ -492,16 +503,16 @@ def vm_setup(ctx, config):
     Look for virtual machines and handle their initialization
     """
     with parallel() as p:
-        editinfo = './teuthology/task/edit_sudoers.sh'
+        editinfo = os.path.join(os.path.dirname(__file__),'edit_sudoers.sh')
         for remote in ctx.cluster.remotes.iterkeys():
             mname = re.match(".*@([^\.]*)\..*", str(remote)).group(1) 
-            if mname[0:3] == 'vpm':
+            if mname.startswith('vpm'):
                 r = remote.run(args=['test', '-e', '/ceph-qa-ready',],
                         stdout=StringIO(),
                         check_status=False,)
                 if r.exitstatus != 0:
                     p1 = subprocess.Popen(['cat', editinfo], stdout=subprocess.PIPE)
-                    p2 = subprocess.Popen(['ssh','-t','-t',str(remote)],stdin=p1.stdout, stdout=subprocess.PIPE)
+                    p2 = subprocess.Popen(['ssh','-t','-t',str(remote), 'sudo', 'sh'],stdin=p1.stdout, stdout=subprocess.PIPE)
                     _,err = p2.communicate()
                     if err:
                         log.info("Edit of /etc/sudoers failed: %s",err)
index ea997fdcfc0100e4edd6cce45fdd56f9e53852d7..2c788d47b03c3426daae2bcdf2fda24ee4b250b1 100644 (file)
@@ -105,7 +105,7 @@ def task(ctx, config):
                 host = t.split('@')[-1]
                 shortname = host.split('.')[0]
                 from ..orchestra import remote as oremote
-                console = oremote.RemoteConsole(
+                console = oremote.getRemoteConsole(
                     name=host,
                     ipmiuser=ctx.teuthology_config['ipmi_user'],
                     ipmipass=ctx.teuthology_config['ipmi_password'],