From a4994e3bde139da025ba1b068c1e77f776106272 Mon Sep 17 00:00:00 2001 From: Warren Usui Date: Thu, 6 Jun 2013 18:43:43 -0700 Subject: [PATCH] Support added for running scheduled tasks on virtual machines. This included: A.) Changes made so that full path names on some files were used (scheduled tasks started in different home directories). B.) Changes to ensure tasks come up on the beanstalkc queue properly, C.) Finding and inserting the libvirt equivalent code for vm machines in order to simulate ipmi actions, D.) Fix host key code, report valgrind issue more clearly. E.) Some message and downburst call changes. Fix #4988 Fix #5122 Signed-off-by: Warren Usui --- bootstrap | 6 +- teuthology/lock.py | 12 ++- teuthology/lockstatus.py | 9 +- teuthology/misc.py | 6 +- teuthology/nuke.py | 2 +- teuthology/orchestra/remote.py | 149 ++++++++++++++++++++++----------- teuthology/task/internal.py | 19 ++++- teuthology/task/thrashosds.py | 2 +- 8 files changed, 140 insertions(+), 65 deletions(-) diff --git a/bootstrap b/bootstrap index cf029b6c3c..ebecbad7b6 100755 --- a/bootstrap +++ b/bootstrap @@ -1,7 +1,7 @@ #!/bin/sh set -e -for package in python-dev python-pip python-virtualenv libevent-dev; do +for package in python-dev python-pip python-virtualenv libevent-dev python-libvirt; do if [ "$(dpkg --status -- $package|sed -n 's/^Status: //p')" != "install ok installed" ]; then # add a space after old values missing="${missing:+$missing }$package" @@ -13,7 +13,9 @@ if [ -n "$missing" ]; then exit 1 fi -virtualenv --no-site-packages --distribute virtualenv +# site packages needed because libvirt python bindings are not nicely +# packaged +virtualenv --system-site-packages --distribute virtualenv # avoid pip bugs ./virtualenv/bin/pip install --upgrade pip diff --git a/teuthology/lock.py b/teuthology/lock.py index 66f73c1edc..a716bfcbd8 100644 --- a/teuthology/lock.py +++ b/teuthology/lock.py @@ -8,6 +8,7 @@ import re import collections import tempfile import os +import time 
from teuthology import lockstatus as ls from teuthology import misc as teuthology @@ -65,6 +66,13 @@ def list_locks(ctx): return None def update_lock(ctx, name, description=None, status=None, sshpubkey=None): + status_info = ls.get_status(ctx, name) + phys_host = status_info['vpshost'] + if phys_host: + keyscan_out = '' + while not keyscan_out: + time.sleep(10) + keyscan_out, _ = keyscan_check(ctx, [name]) updated = {} if description is not None: updated['desc'] = description @@ -540,7 +548,7 @@ def create_if_vm(ctx, machine_name): if not file_out: file_info = {} file_info['disk-size'] = lcnfg.get('disk-size', '30G') - file_info['ram'] = lcnfg.get('ram', '4G') + file_info['ram'] = lcnfg.get('ram', '1.9G') file_info['cpus'] = lcnfg.get('cpus', 1) file_info['networks'] = lcnfg.get('networks', [{'source' : 'front'}]) @@ -562,7 +570,7 @@ def create_if_vm(ctx, machine_name): stdout=subprocess.PIPE,stderr=subprocess.PIPE,) owt,err = p.communicate() if err: - log.info("Downburst command to create %s failed: %s" % + log.info("Downburst completed on %s: %s" % (machine_name,err)) else: log.info("%s created: %s" % (machine_name,owt)) diff --git a/teuthology/lockstatus.py b/teuthology/lockstatus.py index 9e63614a6e..5c25479efc 100644 --- a/teuthology/lockstatus.py +++ b/teuthology/lockstatus.py @@ -5,8 +5,11 @@ import logging log = logging.getLogger(__name__) def _lock_url(ctx): - return ctx.teuthology_config['lock_server'] - + try: + return ctx.teuthology_config['lock_server'] + except (AttributeError, KeyError): + return "http://teuthology.front.sepia.ceph.com/locker/lock" + def send_request(method, url, body=None, headers=None): http = httplib2.Http() resp, content = http.request(url, method=method, body=body, headers=headers) @@ -21,5 +24,3 @@ def get_status(ctx, name): if success: return json.loads(content) return None - - diff --git a/teuthology/misc.py b/teuthology/misc.py index ce3e96d071..1912c15151 100644 --- a/teuthology/misc.py +++ b/teuthology/misc.py @@ -727,10 
+727,14 @@ def reconnect(ctx, timeout, remotes=None): for remote in need_reconnect: try: log.info('trying to connect to %s', remote.name) + key = ctx.config['targets'][remote.name] + kstat = lockstatus.get_status(ctx,remote.name) + if 'sshpubkey' in kstat: + key = kstat['sshpubkey'] from .orchestra import connection remote.ssh = connection.connect( user_at_host=remote.name, - host_key=ctx.config['targets'][remote.name], + host_key=key, keep_alive=True, ) except Exception: diff --git a/teuthology/nuke.py b/teuthology/nuke.py index 8841edd838..5a8efae28c 100644 --- a/teuthology/nuke.py +++ b/teuthology/nuke.py @@ -416,7 +416,7 @@ def nuke_helper(ctx, log): log.debug('shortname: %s' % shortname) log.debug('{ctx}'.format(ctx=ctx)) if not ctx.noipmi and 'ipmi_user' in ctx.teuthology_config: - console = remote.RemoteConsole(name=host, + console = remote.getRemoteConsole(name=host, ipmiuser=ctx.teuthology_config['ipmi_user'], ipmipass=ctx.teuthology_config['ipmi_password'], ipmidomain=ctx.teuthology_config['ipmi_domain']) diff --git a/teuthology/orchestra/remote.py b/teuthology/orchestra/remote.py index f6aa9672e1..47824d5ee3 100644 --- a/teuthology/orchestra/remote.py +++ b/teuthology/orchestra/remote.py @@ -46,22 +46,26 @@ class Remote(object): import pexpect import re import logging +import libvirt +from teuthology import lockstatus as ls log = logging.getLogger(__name__) -class RemoteConsole(object): +def getShortName(name): + hn = name.split('@')[-1] + p = re.compile('([^.]+)\.?.*') + return p.match(hn).groups()[0] + +class PhysicalConsole(): def __init__(self, name, ipmiuser, ipmipass, ipmidomain, logfile=None, timeout=20): self.name = name + self.shortname = getShortName(name) + self.timeout = timeout + self.logfile = None self.ipmiuser = ipmiuser self.ipmipass = ipmipass self.ipmidomain = ipmidomain - self.timeout = timeout - self.logfile = None - - hn = self.name.split('@')[-1] - p = re.compile('([^.]+)\.?.*') - self.shortname = p.match(hn).groups()[0] def 
_exec(self, cmd): if not self.ipmiuser or not self.ipmipass or not self.ipmidomain: @@ -83,6 +87,34 @@ class RemoteConsole(object): child.logfile = self.logfile return child + def _exit_session(self, child, timeout=None): + child.send('~.') + t = timeout + if not t: + t = self.timeout + r = child.expect(['terminated ipmitool', pexpect.TIMEOUT, pexpect.EOF], timeout=t) + if r != 0: + self._exec('sol deactivate') + + def _wait_for_login(self, timeout=None, attempts=6): + log.debug('Waiting for login prompt on {s}'.format(s=self.shortname)) + # wait for login prompt to indicate boot completed + t = timeout + if not t: + t = self.timeout + for i in range(0, attempts): + start = time.time() + while time.time() - start < t: + child = self._exec('sol activate') + child.send('\n') + log.debug('expect: {s} login'.format(s=self.shortname)) + r = child.expect(['{s} login: '.format(s=self.shortname), pexpect.TIMEOUT, pexpect.EOF], timeout=(t - (time.time() - start))) + log.debug('expect before: {b}'.format(b=child.before)) + log.debug('expect after: {a}'.format(a=child.after)) + + self._exit_session(child) + if r == 0: + return def check_power(self, state, timeout=None): # check power total_timeout = timeout @@ -118,36 +150,6 @@ class RemoteConsole(object): log.info('Failed to get ipmi console status for {s}: {e}'.format(s=self.shortname, e=e)) return False - def _exit_session(self, child, timeout=None): - child.send('~.') - t = timeout - if not t: - t = self.timeout - r = child.expect(['terminated ipmitool', pexpect.TIMEOUT, pexpect.EOF], timeout=t) - if r != 0: - self._exec('sol deactivate') - - def _wait_for_login(self, timeout=None, attempts=6): - log.debug('Waiting for login prompt on {s}'.format(s=self.shortname)) - # wait for login prompt to indicate boot completed - t = timeout - if not t: - t = self.timeout - for i in range(0, attempts): - start = time.time() - while time.time() - start < t: - child = self._exec('sol activate') - child.send('\n') - log.debug('expect: 
{s} login'.format(s=self.shortname)) - r = child.expect(['{s} login: '.format(s=self.shortname), pexpect.TIMEOUT, pexpect.EOF], timeout=(t - (time.time() - start))) - log.debug('expect before: {b}'.format(b=child.before)) - log.debug('expect after: {a}'.format(a=child.after)) - - self._exit_session(child) - if r == 0: - return - raise pexpect.TIMEOUT ('Timeout exceeded ({t}) for {a} attempts in _wait_for_login()'.format(t=t, a=attempts)) - def power_cycle(self): log.info('Power cycling {s}'.format(s=self.shortname)) child = self._exec('power cycle') @@ -166,18 +168,6 @@ class RemoteConsole(object): self._wait_for_login() log.info('Hard reset for {s} completed'.format(s=self.shortname)) - def power_off(self): - log.info('Power off {s}'.format(s=self.shortname)) - start = time.time() - while time.time() - start < self.timeout: - child = self._exec('power off') - r = child.expect(['Chassis Power Control: Down/Off', pexpect.EOF], timeout=self.timeout) - if r == 0: - break - if not self.check_power('off', 60): - log.error('Failed to power off {s}'.format(s=self.shortname)) - log.info('Power off for {s} completed'.format(s=self.shortname)) - def power_on(self): log.info('Power on {s}'.format(s=self.shortname)) start = time.time() @@ -190,6 +180,18 @@ class RemoteConsole(object): log.error('Failed to power on {s}'.format(s=self.shortname)) log.info('Power on for {s} completed'.format(s=self.shortname)) + def power_off(self): + log.info('Power off {s}'.format(s=self.shortname)) + start = time.time() + while time.time() - start < self.timeout: + child = self._exec('power off') + r = child.expect(['Chassis Power Control: Down/Off', pexpect.EOF], timeout=self.timeout) + if r == 0: + break + if not self.check_power('off', 60): + log.error('Failed to power off {s}'.format(s=self.shortname)) + log.info('Power off for {s} completed'.format(s=self.shortname)) + def power_off_for_interval(self, interval=30): log.info('Power off {s} for {i} seconds'.format(s=self.shortname, 
i=interval)) child = self._exec('power off') @@ -199,7 +201,54 @@ class RemoteConsole(object): child = self._exec('power on') child.expect('Chassis Power Control: Up/On', timeout=self.timeout) - self._wait_for_login() + log.info('Power off for {i} seconds completed'.format(s=self.shortname, i=interval)) +class VirtualConsole(): + + def __init__(self, name, ipmiuser, ipmipass, ipmidomain, logfile=None, timeout=20): + self.shortname = getShortName(name) + status_info = ls.get_status('', self.shortname) + try: + phys_host = status_info['vpshost'] + except TypeError: + return + self.connection = libvirt.open(phys_host) + for i in self.connection.listDomainsID(): + d = con.lookupByID(i) + if d.name() == self.shortname: + self.vm_domain = d + break + return + + def check_power(self, state, timeout=None): + return self.vm_domain.info[0] in [libvirt.VIR_DOMAIN_RUNNING, libvirt.VIR_DOMAIN_BLOCKED, + libvirt.VIR_DOMAIN_PAUSED] + + def check_status(self, timeout=None): + return self.vm_domain.info()[0] == libvirt.VIR_DOMAIN_RUNNING + + def power_cycle(self): + self.vm_domain.info().destroy() + self.vm_domain.info().create() + + def hard_reset(self): + self.vm_domain.info().destroy() + + def power_on(self): + self.vm_domain.info().create() + + def power_off(self): + self.vm_domain.info().destroy() + + def power_off_for_interval(self, interval=30): + log.info('Power off {s} for {i} seconds'.format(s=self.shortname, i=interval)) + self.vm_domain.info().destroy() + time.sleep(interval) + self.vm_domain.info().create() log.info('Power off for {i} seconds completed'.format(s=self.shortname, i=interval)) + +def getRemoteConsole(name, ipmiuser, ipmipass, ipmidomain, logfile=None, timeout=20): + if name.startswith('vpm'): + return VirtualConsole(name, ipmiuser, ipmipass, ipmidomain, logfile, timeout) + return PhysicalConsole(name, ipmiuser, ipmipass, ipmidomain, logfile, timeout) diff --git a/teuthology/task/internal.py b/teuthology/task/internal.py index 53249685e1..444341eb4e 100644 
--- a/teuthology/task/internal.py +++ b/teuthology/task/internal.py @@ -106,6 +106,7 @@ def lock_machines(ctx, config): while not keyscan_out: time.sleep(10) keyscan_out, current_locks = lock.keyscan_check(ctx, vmlist) + log.info('virtual machine is stil unavailable') if lock.update_keys(ctx, keyscan_out, current_locks): log.info("Error in virtual machine keys") newscandict = {} @@ -172,8 +173,18 @@ def connect(ctx, config): from ..orchestra import connection, remote from ..orchestra import cluster remotes = [] - for t, key in ctx.config['targets'].iteritems(): + machs = [] + for name in ctx.config['targets'].iterkeys(): + machs.append(name) + lock.scan_for_locks(ctx, machs) + for t, xkey in ctx.config['targets'].iteritems(): log.debug('connecting to %s', t) + log.info('Key is :%s:', xkey) + oldkeystatus = lockstatus.get_status(ctx, t) + key = xkey + if 'sshpubkey' in oldkeystatus: + log.info('possible key is :%s:',oldkeystatus['sshpubkey']) + key = oldkeystatus['sshpubkey'] remotes.append( remote.Remote(name=t, ssh=connection.connect(user_at_host=t, @@ -492,16 +503,16 @@ def vm_setup(ctx, config): Look for virtual machines and handle their initialization """ with parallel() as p: - editinfo = './teuthology/task/edit_sudoers.sh' + editinfo = os.path.join(os.path.dirname(__file__),'edit_sudoers.sh') for remote in ctx.cluster.remotes.iterkeys(): mname = re.match(".*@([^\.]*)\..*", str(remote)).group(1) - if mname[0:3] == 'vpm': + if mname.startswith('vpm'): r = remote.run(args=['test', '-e', '/ceph-qa-ready',], stdout=StringIO(), check_status=False,) if r.exitstatus != 0: p1 = subprocess.Popen(['cat', editinfo], stdout=subprocess.PIPE) - p2 = subprocess.Popen(['ssh','-t','-t',str(remote)],stdin=p1.stdout, stdout=subprocess.PIPE) + p2 = subprocess.Popen(['ssh','-t','-t',str(remote), 'sudo', 'sh'],stdin=p1.stdout, stdout=subprocess.PIPE) _,err = p2.communicate() if err: log.info("Edit of /etc/sudoers failed: %s",err) diff --git a/teuthology/task/thrashosds.py 
b/teuthology/task/thrashosds.py index ea997fdcfc..2c788d47b0 100644 --- a/teuthology/task/thrashosds.py +++ b/teuthology/task/thrashosds.py @@ -105,7 +105,7 @@ def task(ctx, config): host = t.split('@')[-1] shortname = host.split('.')[0] from ..orchestra import remote as oremote - console = oremote.RemoteConsole( + console = oremote.getRemoteConsole( name=host, ipmiuser=ctx.teuthology_config['ipmi_user'], ipmipass=ctx.teuthology_config['ipmi_password'], -- 2.39.5