From 4ea434fc5f8972def50b6d8ae42560c3a65b1c34 Mon Sep 17 00:00:00 2001
From: Zack Cerza
Date: Thu, 17 Mar 2016 14:27:09 -0600
Subject: [PATCH] Move internal.lock_machines to its own module

Signed-off-by: Zack Cerza
---
 teuthology/task/internal/__init__.py      | 154 +--------------------
 teuthology/task/internal/lock_machines.py | 158 ++++++++++++++++++++++
 2 files changed, 163 insertions(+), 149 deletions(-)
 create mode 100644 teuthology/task/internal/lock_machines.py

diff --git a/teuthology/task/internal/__init__.py b/teuthology/task/internal/__init__.py
index 092a1cf3e3..5187402d31 100644
--- a/teuthology/task/internal/__init__.py
+++ b/teuthology/task/internal/__init__.py
@@ -11,18 +11,17 @@
 import time
 import yaml
 import subprocess
 
-from teuthology import lockstatus
 from teuthology import lock
+from teuthology import lockstatus
 from teuthology import misc
-from teuthology import provision
 from teuthology.packaging import get_builder_project
+from teuthology import report
+from teuthology.config import config as teuth_config
 from teuthology.exceptions import VersionNotFoundError
 from teuthology.job_status import get_status, set_status
-from teuthology.config import config as teuth_config
+from teuthology.orchestra import cluster, remote, run
 from teuthology.parallel import parallel
-from ..orchestra import cluster, remote, run
-from .. import report
-from . import ansible
+from teuthology.task import ansible
 
 log = logging.getLogger(__name__)
@@ -56,149 +55,6 @@ def base(ctx, config):
     )
 
 
-@contextlib.contextmanager
-def lock_machines(ctx, config):
-    """
-    Lock machines. Called when the teuthology run finds and locks
-    new machines. This is not called if the one has teuthology-locked
-    machines and placed those keys in the Targets section of a yaml file.
-    """
-    # It's OK for os_type and os_version to be None here. If we're trying
-    # to lock a bare metal machine, we'll take whatever is available. If
-    # we want a vps, defaults will be provided by misc.get_distro and
-    # misc.get_distro_version in provision.create_if_vm
-    os_type = ctx.config.get("os_type")
-    os_version = ctx.config.get("os_version")
-    arch = ctx.config.get('arch')
-    log.info('Locking machines...')
-    assert isinstance(config[0], int), 'config[0] must be an integer'
-    machine_type = config[1]
-    total_requested = config[0]
-    # We want to make sure there are always this many machines available
-    reserved = teuth_config.reserve_machines
-    assert isinstance(reserved, int), 'reserve_machines must be integer'
-    assert (reserved >= 0), 'reserve_machines should >= 0'
-
-    # change the status during the locking process
-    report.try_push_job_info(ctx.config, dict(status='waiting'))
-
-    all_locked = dict()
-    requested = total_requested
-    while True:
-        # get a candidate list of machines
-        machines = lock.list_locks(machine_type=machine_type, up=True,
-                                   locked=False, count=requested + reserved)
-        if machines is None:
-            if ctx.block:
-                log.error('Error listing machines, trying again')
-                time.sleep(20)
-                continue
-            else:
-                raise RuntimeError('Error listing machines')
-
-        # make sure there are machines for non-automated jobs to run
-        if len(machines) < reserved + requested and ctx.owner.startswith('scheduled'):
-            if ctx.block:
-                log.info(
-                    'waiting for more %s machines to be free (need %s + %s, have %s)...',
-                    machine_type,
-                    reserved,
-                    requested,
-                    len(machines),
-                )
-                time.sleep(10)
-                continue
-            else:
-                assert 0, ('not enough machines free; need %s + %s, have %s' %
-                           (reserved, requested, len(machines)))
-
-        try:
-            newly_locked = lock.lock_many(ctx, requested, machine_type,
-                                          ctx.owner, ctx.archive, os_type,
-                                          os_version, arch)
-        except Exception:
-            # Lock failures should map to the 'dead' status instead of 'fail'
-            set_status(ctx.summary, 'dead')
-            raise
-        all_locked.update(newly_locked)
-        log.info(
-            '{newly_locked} {mtype} machines locked this try, '
-            '{total_locked}/{total_requested} locked so far'.format(
-                newly_locked=len(newly_locked),
-                mtype=machine_type,
-                total_locked=len(all_locked),
-                total_requested=total_requested,
-            )
-        )
-        if len(all_locked) == total_requested:
-            vmlist = []
-            for lmach in all_locked:
-                if misc.is_vm(lmach):
-                    vmlist.append(lmach)
-            if vmlist:
-                log.info('Waiting for virtual machines to come up')
-                keys_dict = dict()
-                loopcount = 0
-                while len(keys_dict) != len(vmlist):
-                    loopcount += 1
-                    time.sleep(10)
-                    keys_dict = misc.ssh_keyscan(vmlist)
-                    log.info('virtual machine is still unavailable')
-                    if loopcount == 40:
-                        loopcount = 0
-                        log.info('virtual machine(s) still not up, ' +
-                                 'recreating unresponsive ones.')
-                        for guest in vmlist:
-                            if guest not in keys_dict.keys():
-                                log.info('recreating: ' + guest)
-                                full_name = misc.canonicalize_hostname(guest)
-                                provision.destroy_if_vm(ctx, full_name)
-                                provision.create_if_vm(ctx, full_name)
-                if lock.do_update_keys(keys_dict):
-                    log.info("Error in virtual machine keys")
-                newscandict = {}
-                for dkey in all_locked.iterkeys():
-                    stats = lockstatus.get_status(dkey)
-                    newscandict[dkey] = stats['ssh_pub_key']
-                ctx.config['targets'] = newscandict
-            else:
-                ctx.config['targets'] = all_locked
-            locked_targets = yaml.safe_dump(
-                ctx.config['targets'],
-                default_flow_style=False
-            ).splitlines()
-            log.info('\n '.join(['Locked targets:', ] + locked_targets))
-            # successfully locked machines, change status back to running
-            report.try_push_job_info(ctx.config, dict(status='running'))
-            break
-        elif not ctx.block:
-            assert 0, 'not enough machines are available'
-        else:
-            requested = requested - len(newly_locked)
-            assert requested > 0, "lock_machines: requested counter went" \
-                "negative, this shouldn't happen"
-
-        log.info(
-            "{total} machines locked ({new} new); need {more} more".format(
-                total=len(all_locked), new=len(newly_locked), more=requested)
-        )
-        log.warn('Could not lock enough machines, waiting...')
-        time.sleep(10)
-    try:
-        yield
-    finally:
-        # If both unlock_on_failure and nuke-on-error are set, don't unlock now
-        # because we're just going to nuke (and unlock) later.
-        unlock_on_failure = (
-            ctx.config.get('unlock_on_failure', False)
-            and not ctx.config.get('nuke-on-error', False)
-        )
-        if get_status(ctx.summary) == 'pass' or unlock_on_failure:
-            log.info('Unlocking machines...')
-            for machine in ctx.config['targets'].iterkeys():
-                lock.unlock_one(ctx, machine, ctx.owner, ctx.archive)
-
-
 def save_config(ctx, config):
     """
     Store the config in a yaml file
diff --git a/teuthology/task/internal/lock_machines.py b/teuthology/task/internal/lock_machines.py
new file mode 100644
index 0000000000..fcf8d21172
--- /dev/null
+++ b/teuthology/task/internal/lock_machines.py
@@ -0,0 +1,158 @@
+import contextlib
+import logging
+import time
+import yaml
+
+from teuthology import lock
+from teuthology import lockstatus
+from teuthology import misc
+from teuthology import provision
+from teuthology import report
+
+from teuthology.config import config as teuth_config
+from teuthology.job_status import get_status, set_status
+
+log = logging.getLogger(__name__)
+
+
+@contextlib.contextmanager
+def lock_machines(ctx, config):
+    """
+    Lock machines. Called when the teuthology run finds and locks new
+    machines. This is not called if the user has already teuthology-locked
+    machines and placed those keys in the targets section of a yaml file.
+    """
+    # It's OK for os_type and os_version to be None here. If we're trying
+    # to lock a bare metal machine, we'll take whatever is available. If
+    # we want a vps, defaults will be provided by misc.get_distro and
+    # misc.get_distro_version in provision.create_if_vm
+    os_type = ctx.config.get("os_type")
+    os_version = ctx.config.get("os_version")
+    arch = ctx.config.get('arch')
+    log.info('Locking machines...')
+    assert isinstance(config[0], int), 'config[0] must be an integer'
+    machine_type = config[1]
+    total_requested = config[0]
+    # We want to make sure there are always this many machines available
+    reserved = teuth_config.reserve_machines
+    assert isinstance(reserved, int), 'reserve_machines must be integer'
+    assert (reserved >= 0), 'reserve_machines should be >= 0'
+
+    # change the status during the locking process
+    report.try_push_job_info(ctx.config, dict(status='waiting'))
+
+    all_locked = dict()
+    requested = total_requested
+    while True:
+        # get a candidate list of machines
+        machines = lock.list_locks(machine_type=machine_type, up=True,
+                                   locked=False, count=requested + reserved)
+        if machines is None:
+            if ctx.block:
+                log.error('Error listing machines, trying again')
+                time.sleep(20)
+                continue
+            else:
+                raise RuntimeError('Error listing machines')
+
+        # make sure there are machines for non-automated jobs to run
+        if len(machines) < reserved + requested and ctx.owner.startswith('scheduled'):
+            if ctx.block:
+                log.info(
+                    'waiting for more %s machines to be free (need %s + %s, have %s)...',
+                    machine_type,
+                    reserved,
+                    requested,
+                    len(machines),
+                )
+                time.sleep(10)
+                continue
+            else:
+                assert 0, ('not enough machines free; need %s + %s, have %s' %
+                           (reserved, requested, len(machines)))
+
+        try:
+            newly_locked = lock.lock_many(ctx, requested, machine_type,
+                                          ctx.owner, ctx.archive, os_type,
+                                          os_version, arch)
+        except Exception:
+            # Lock failures should map to the 'dead' status instead of 'fail'
+            set_status(ctx.summary, 'dead')
+            raise
+        all_locked.update(newly_locked)
+        log.info(
+            '{newly_locked} {mtype} machines locked this try, '
+            '{total_locked}/{total_requested} locked so far'.format(
+                newly_locked=len(newly_locked),
+                mtype=machine_type,
+                total_locked=len(all_locked),
+                total_requested=total_requested,
+            )
+        )
+        if len(all_locked) == total_requested:
+            vmlist = []
+            for lmach in all_locked:
+                if misc.is_vm(lmach):
+                    vmlist.append(lmach)
+            if vmlist:
+                log.info('Waiting for virtual machines to come up')
+                keys_dict = dict()
+                loopcount = 0
+                while len(keys_dict) != len(vmlist):
+                    loopcount += 1
+                    time.sleep(10)
+                    keys_dict = misc.ssh_keyscan(vmlist)
+                    log.info('virtual machine(s) still unavailable')
+                    if loopcount == 40:
+                        loopcount = 0
+                        log.info('virtual machine(s) still not up, ' +
+                                 'recreating unresponsive ones.')
+                        for guest in vmlist:
+                            if guest not in keys_dict.keys():
+                                log.info('recreating: ' + guest)
+                                full_name = misc.canonicalize_hostname(guest)
+                                provision.destroy_if_vm(ctx, full_name)
+                                provision.create_if_vm(ctx, full_name)
+                if lock.do_update_keys(keys_dict):
+                    log.info("Error in virtual machine keys")
+                newscandict = {}
+                for dkey in all_locked.iterkeys():
+                    stats = lockstatus.get_status(dkey)
+                    newscandict[dkey] = stats['ssh_pub_key']
+                ctx.config['targets'] = newscandict
+            else:
+                ctx.config['targets'] = all_locked
+            locked_targets = yaml.safe_dump(
+                ctx.config['targets'],
+                default_flow_style=False
+            ).splitlines()
+            log.info('\n '.join(['Locked targets:', ] + locked_targets))
+            # successfully locked machines, change status back to running
+            report.try_push_job_info(ctx.config, dict(status='running'))
+            break
+        elif not ctx.block:
+            assert 0, 'not enough machines are available'
+        else:
+            requested = requested - len(newly_locked)
+            assert requested > 0, "lock_machines: requested counter went " \
+                "negative, this shouldn't happen"
+
+        log.info(
+            "{total} machines locked ({new} new); need {more} more".format(
+                total=len(all_locked), new=len(newly_locked), more=requested)
+        )
+        log.warn('Could not lock enough machines, waiting...')
+        time.sleep(10)
+    try:
+        yield
+    finally:
+        # If both unlock_on_failure and nuke-on-error are set, don't unlock now
+        # because we're just going to nuke (and unlock) later.
+        unlock_on_failure = (
+            ctx.config.get('unlock_on_failure', False)
+            and not ctx.config.get('nuke-on-error', False)
+        )
+        if get_status(ctx.summary) == 'pass' or unlock_on_failure:
+            log.info('Unlocking machines...')
+            for machine in ctx.config['targets'].iterkeys():
+                lock.unlock_one(ctx, machine, ctx.owner, ctx.archive)
-- 
2.39.5
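
A minimal usage sketch, not part of the patch itself: how a caller might
drive the moved task directly, assuming a teuthology-style run context and
a reachable lock server. The FakeCtx class, the 'user@example' owner, the
'/tmp/archive' path and the 'mytype' machine type are all illustrative
placeholders; in real runs the (count, type) pair comes from the job
configuration rather than being passed by hand.

    from teuthology.task.internal.lock_machines import lock_machines

    class FakeCtx(object):
        # Illustrative stand-in for teuthology's run context; the
        # attributes mirror the ones lock_machines actually reads.
        def __init__(self):
            self.config = {'os_type': None, 'os_version': None}
            self.block = True            # keep retrying until enough machines lock
            self.owner = 'user@example'  # owners starting with 'scheduled'
                                         # also honor reserve_machines
            self.archive = '/tmp/archive'
            self.summary = {}

    ctx = FakeCtx()
    # config is a two-element sequence: (machine_count, machine_type)
    with lock_machines(ctx, (3, 'mytype')):
        # while locked, ctx.config['targets'] maps hostnames to ssh pubkeys
        print(ctx.config['targets'])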