From: Zack Cerza Date: Wed, 23 Aug 2017 20:03:53 +0000 (-0600) Subject: Support reimaging with FOG X-Git-Tag: 1.1.0~375^2~7 X-Git-Url: http://git.apps.os.sepia.ceph.com/?a=commitdiff_plain;h=708c141a2bb6587f8ef5d4b6fb5396a0afe7e51a;p=teuthology.git Support reimaging with FOG https://fogproject.org Signed-off-by: Zack Cerza --- diff --git a/docs/siteconfig.rst b/docs/siteconfig.rst index 05b35bbf8b..55b2e288aa 100644 --- a/docs/siteconfig.rst +++ b/docs/siteconfig.rst @@ -229,3 +229,10 @@ Here is a sample configuration with many of the options set and documented:: # Settings for [nsupdate-web](https://github.com/zmc/nsupdate-web) # Used by the [libcloud](https://libcloud.apache.org/) backend nsupdate_url: http://nsupdate.front.sepia.ceph.com/update + + # Settings for https://fogproject.org/ + fog: + endpoint: http://fog.example.com/fog + api_token: your_api_token + user_token: your_user_token + machine_types: ['mira', 'smithi'] diff --git a/teuthology/lock/cli.py b/teuthology/lock/cli.py index dfd50ca842..a3c3002a22 100644 --- a/teuthology/lock/cli.py +++ b/teuthology/lock/cli.py @@ -153,6 +153,9 @@ def main(ctx): ret = 1 if not ctx.f: return ret + elif not query.is_vm(machine): + teuthology.provision.reimage(ctx, machine) + keys.do_update_keys([machine]) else: machines_to_update.append(machine) teuthology.provision.create_if_vm( diff --git a/teuthology/lock/ops.py b/teuthology/lock/ops.py index 64e1427eb1..309ae044c9 100644 --- a/teuthology/lock/ops.py +++ b/teuthology/lock/ops.py @@ -77,7 +77,8 @@ def lock_many(ctx, num, machine_type, user=None, description=None, # Only query for os_type/os_version if non-vps and non-libcloud, since # in that case we just create them. 
vm_types = ['vps'] + teuthology.provision.cloud.get_types() - if machine_type not in vm_types: + reimage_types = teuthology.provision.fog.get_types() + if machine_type not in vm_types + reimage_types: if os_type: data['os_type'] = os_type if os_version: @@ -106,6 +107,13 @@ def lock_many(ctx, num, machine_type, user=None, description=None, unlock_one(ctx, machine, user) ok_machs = keys.do_update_keys(ok_machs.keys())[1] return ok_machs + elif machine_type in reimage_types: + reimaged = dict() + for machine in machines: + teuthology.provision.reimage(ctx, machine) + reimaged[machine] = machines[machine] + reimaged = keys.do_update_keys(reimaged.keys())[1] + return reimaged return machines elif response.status_code == 503: log.error('Insufficient nodes available to lock %d %s nodes.', diff --git a/teuthology/provision/__init__.py b/teuthology/provision/__init__.py index b5512ea137..261bc018c0 100644 --- a/teuthology/provision/__init__.py +++ b/teuthology/provision/__init__.py @@ -5,6 +5,7 @@ from ..misc import decanonicalize_hostname, get_distro, get_distro_version import cloud import downburst +import fog import openstack import os @@ -18,6 +19,13 @@ def _logfile(ctx, shortname): shortname + '.downburst.log') +def reimage(ctx, machine_name): + os_type = get_distro(ctx) + os_version = get_distro_version(ctx) + fog_obj = fog.FOG(machine_name, os_type, os_version) + return fog_obj.create() + + def create_if_vm(ctx, machine_name, _downburst=None): """ Use downburst to create a virtual machine diff --git a/teuthology/provision/fog.py b/teuthology/provision/fog.py new file mode 100644 index 0000000000..202ef0cf06 --- /dev/null +++ b/teuthology/provision/fog.py @@ -0,0 +1,289 @@ +import json +import logging +import requests +import socket + +from datetime import datetime +from paramiko import SSHException +from StringIO import StringIO + +import teuthology.orchestra + +from ..config import config +from ..contextutil import safe_while +from teuthology.exceptions import 
log = logging.getLogger(__name__)


def enabled(warn=False):
    """
    Check for required FOG settings

    :param warn: Whether or not to log a message containing unset parameters
    :returns: True if they are present; False if they are not
    """
    fog_conf = config.get('fog', dict())
    params = ['endpoint', 'api_token', 'user_token', 'machine_types']
    # A parameter counts as unset when it is missing *or* falsy (None, '', [])
    unset = [param for param in params if not fog_conf.get(param)]
    if unset and warn:
        # Logger.warn is a deprecated alias; warning() is the supported name
        log.warning(
            "FOG disabled; set the following config options to enable: %s",
            ' '.join(unset),
        )
    return not unset


def get_types():
    """
    Fetch and parse config.fog['machine_types']

    The setting may be either a list or a comma-separated string; whitespace
    around comma-separated entries is ignored.

    :returns: The list of FOG-configured machine types. An empty list if FOG is
              not configured.
    """
    if not enabled():
        return []
    fog_conf = config.get('fog', dict())
    types = fog_conf.get('machine_types', '')
    if not isinstance(types, list):
        # Tolerate "mira, smithi" as well as "mira,smithi"
        types = [type_.strip() for type_ in types.split(',')]
    return [type_ for type_ in types if type_]


class FOG(object):
    """
    Reimage bare-metal machines with https://fogproject.org/
    """
    # Format used to parse the 'createdTime' field of FOG task objects
    timestamp_format = '%Y-%m-%d %H:%M:%S'

    def __init__(self, name, os_type, os_version):
        """
        :param name:       The name of the machine to reimage; it is passed
                           through misc.canonicalize_hostname()
        :param os_type:    The OS type to deploy, e.g. 'ubuntu'
        :param os_version: The OS version to deploy, e.g. '16.04'
        """
        self.remote = teuthology.orchestra.remote.Remote(
            misc.canonicalize_hostname(name))
        self.name = self.remote.hostname
        self.shortname = self.remote.shortname
        self.os_type = os_type
        self.os_version = os_version
        # Per-host child logger so messages identify the machine
        self.log = log.getChild(self.shortname)

    def create(self):
        """
        Initiate deployment and wait until completion

        :raises: RuntimeError if FOG is not configured
        """
        if not enabled():
            raise RuntimeError("FOG is not configured!")
        host_data = self.get_host_data()
        host_id = int(host_data['id'])
        self.set_image(host_id)
        task_id = self.schedule_deploy_task(host_id)
        # Use power_off/power_on because other methods call _wait_for_login,
        # which will not work here since the newly-imaged host will have an
        # incorrect hostname
        self.remote.console.power_off()
        self.remote.console.power_on()
        self.wait_for_deploy_task(task_id)
        self._wait_for_ready()
        self._fix_hostname()
        self.log.info("Deploy complete!")

    def do_request(self, url_suffix, data=None, method='GET', verify=True):
        """
        A convenience method to submit a request to the FOG server

        :param url_suffix: The portion of the URL to append to the endpoint,
                           e.g. '/system/info'
        :param data:       Optional JSON data to submit with the request
        :param method:     The HTTP method to use for the request (default:
                           'GET')
        :param verify:     Whether or not to raise an exception if the request
                           is unsuccessful (default: True)
        :returns: A requests.models.Response object
        """
        req_kwargs = dict(
            headers={
                'fog-api-token': config.fog['api_token'],
                'fog-user-token': config.fog['user_token'],
            },
        )
        if data is not None:
            req_kwargs['data'] = data
        req = requests.Request(
            method,
            config.fog['endpoint'] + url_suffix,
            **req_kwargs
        )
        prepped = req.prepare()
        # Close the session (and its connection pool) once the response has
        # been received instead of leaking one session per request
        with requests.Session() as session:
            resp = session.send(prepped)
        if not resp.ok and resp.text:
            self.log.error("%s: %s", resp.status_code, resp.text)
        if verify:
            resp.raise_for_status()
        return resp

    def get_host_data(self):
        """
        Locate the host we want to use, and return the FOG object which
        represents it

        :returns: A dict describing the host
        :raises: RuntimeError unless exactly one host matches our shortname
        """
        resp = self.do_request(
            '/host',
            data=json.dumps(dict(name=self.shortname)),
        )
        obj = resp.json()
        if obj['count'] == 0:
            raise RuntimeError("Host %s not found!" % self.shortname)
        if obj['count'] > 1:
            raise RuntimeError(
                "More than one host found for %s" % self.shortname)
        return obj['hosts'][0]

    def get_image_data(self):
        """
        Locate the image we want to use, and return the FOG object which
        represents it

        The image is looked up by the name
        '<machine_type>_<os_type, lowercased>_<os_version>'.

        :returns: A dict describing the image
        :raises: RuntimeError if no matching image is found
        """
        name = '_'.join([
            self.remote.machine_type, self.os_type.lower(), self.os_version])
        resp = self.do_request(
            '/image',
            data=json.dumps(dict(name=name)),
        )
        obj = resp.json()
        if not obj['count']:
            # Exceptions do not interpolate logging-style arguments, so the
            # message has to be formatted explicitly
            raise RuntimeError(
                "Could not find an image for %s %s" % (
                    self.os_type, self.os_version)
            )
        return obj['images'][0]

    def set_image(self, host_id):
        """
        Tell FOG to use the proper image on the next deploy

        :param host_id: The id of the host to deploy
        """
        image_data = self.get_image_data()
        image_id = int(image_data['id'])
        self.do_request(
            '/host/%s' % host_id,
            method='PUT',
            data=json.dumps(dict(imageID=image_id)),
        )

    def schedule_deploy_task(self, host_id):
        """
        Schedule a 'deploy' task for the host

        :param host_id: The id of the host to deploy
        :returns: The id of the scheduled task, or None if no sufficiently
                  recent active task could be found for our host
        """
        self.log.info(
            "Scheduling deploy of %s %s",
            self.os_type, self.os_version)
        # First, we need to find the right tasktype ID
        resp = self.do_request(
            '/tasktype',
            data=json.dumps(dict(name='deploy')),
        )
        tasktypes = [obj for obj in resp.json()['tasktypes']
                     if obj['name'].lower() == 'deploy']
        deploy_id = int(tasktypes[0]['id'])
        # Next, schedule the task
        self.do_request(
            '/host/%i/task' % host_id,
            method='POST',
            data='{"taskTypeID": %i}' % deploy_id,
        )
        # The POST response is not used to identify the task; look it up
        # among the tasks that are active on our host instead
        for task in self.get_deploy_tasks():
            timestamp = task['createdTime']
            time_delta = (
                datetime.utcnow() - datetime.strptime(
                    timestamp, self.timestamp_format)
            ).total_seconds()
            # There should only be one deploy task matching our host. Just in
            # case there are multiple, select a very recent one.
            if time_delta < 5:
                return task['id']
        # NOTE(review): callers receive None here and will then treat the
        # deploy as already finished -- confirm whether that is intended
        self.log.warning(
            "Could not find a recently-created deploy task for %s",
            self.shortname)
        return None

    def get_deploy_tasks(self):
        """
        :returns: A list of deploy tasks which are active on our host
        """
        resp = self.do_request('/task/active')
        tasks = resp.json()['tasks']
        host_tasks = [obj for obj in tasks
                      if obj['host']['name'] == self.shortname]
        return host_tasks

    def deploy_task_active(self, task_id):
        """
        :param task_id: The id of the task to query
        :returns: True if the task is active
        """
        host_tasks = self.get_deploy_tasks()
        return any(task['id'] == task_id for task in host_tasks)

    def wait_for_deploy_task(self, task_id):
        """
        Wait until the specified task is no longer active (i.e., it has
        completed)

        Polls using safe_while(sleep=15, tries=40).

        :param task_id: The id of the task to wait for
        """
        self.log.info("Waiting for deploy to finish")
        with safe_while(sleep=15, tries=40) as proceed:
            while proceed():
                if not self.deploy_task_active(task_id):
                    break

    def _wait_for_ready(self):
        """ Attempt to connect to the machine via SSH """
        with safe_while(sleep=6, tries=50) as proceed:
            while proceed():
                try:
                    self.remote.connect()
                    break
                except (
                    socket.error,
                    SSHException,
                    MaxWhileTries,
                    EOFError,
                ):
                    # The host is presumably still booting; retry until
                    # safe_while runs out of tries
                    pass

    def _fix_hostname(self):
        """
        After a reimage, the host will still have the hostname of the machine
        used to create the image initially. Fix that by making a call to
        /bin/hostname and tweaking /etc/hosts.
        """
        proc = self.remote.run(args='hostname', stdout=StringIO())
        wrong_hostname = proc.stdout.read().strip()
        proc = self.remote.run(
            args='grep %s /etc/hosts' % wrong_hostname,
            stdout=StringIO(),
            check_status=False,
        )
        if proc.returncode == 0:
            # /etc/hosts fields may be separated by tabs or runs of spaces,
            # so split on any whitespace to isolate the address
            wrong_ip = proc.stdout.readlines()[0].split()[0]
            self.remote.run(args="sudo hostname %s" % self.shortname)
            self.remote.run(
                args="sudo sed -i -e 's/%s/%s/g' /etc/hosts" % (
                    wrong_hostname, self.shortname),
            )
            self.remote.run(
                args="sudo sed -i -e 's/%s/%s/g' /etc/hosts" % (
                    wrong_ip, self.remote.ip_address),
            )
        # Best-effort: failures of these two commands are ignored
        self.remote.run(
            args="sudo sed -i -e 's/%s/%s/g' /etc/hostname" % (
                wrong_hostname, self.shortname),
            check_status=False,
        )
        self.remote.run(
            args="sudo hostname %s" % self.shortname,
            check_status=False,
        )

    def destroy(self):
        """A no-op; we just leave idle nodes as-is"""
        pass
class TestFOG(object):
    """
    Unit tests for teuthology.provision.fog.FOG

    Network access, sleeping and the Remote object's connection-related
    attributes are patched out for every test; see start_patchers().
    """
    klass = fog.FOG

    def setup(self):
        """Load the config, overlay the FOG test settings, start patchers"""
        config.load()
        config.update(deepcopy(test_config))
        self.start_patchers()

    def start_patchers(self):
        """
        Start every patcher and store the resulting mock in self.mocks under
        the same key the patcher has in self.patchers
        """
        self.patchers = dict()
        self.patchers['m_sleep'] = patch(
            'time.sleep',
        )
        self.patchers['m_requests_Session_send'] = patch(
            'requests.Session.send',
        )
        self.patchers['m_Remote_connect'] = patch(
            'teuthology.orchestra.remote.Remote.connect'
        )
        self.patchers['m_Remote_run'] = patch(
            'teuthology.orchestra.remote.Remote.run'
        )
        self.patchers['m_Remote_console'] = patch(
            'teuthology.orchestra.remote.Remote.console',
            new_callable=PropertyMock,
        )
        self.patchers['m_Remote_hostname'] = patch(
            'teuthology.orchestra.remote.Remote.hostname',
            new_callable=PropertyMock,
        )
        self.patchers['m_Remote_machine_type'] = patch(
            'teuthology.orchestra.remote.Remote.machine_type',
            new_callable=PropertyMock,
        )
        self.mocks = dict()
        for name, patcher in self.patchers.items():
            self.mocks[name] = patcher.start()

    def teardown(self):
        """Stop every patcher started by start_patchers()"""
        for patcher in self.patchers.values():
            patcher.stop()

    @mark.parametrize('enabled', [True, False])
    def test_get_types(self, enabled):
        """get_types() returns the configured types only when FOG is enabled"""
        with patch('teuthology.provision.fog.enabled') as m_enabled:
            m_enabled.return_value = enabled
            types = fog.get_types()
        if enabled:
            assert types == test_config['fog']['machine_types'].split(',')
        else:
            assert types == []

    def test_disabled(self):
        """create() refuses to run when a required setting is unset"""
        config.fog['endpoint'] = None
        obj = self.klass('name.fqdn', 'type', '1.0')
        with raises(RuntimeError):
            obj.create()

    def test_init(self):
        """The constructor stores names and OS parameters"""
        self.mocks['m_Remote_hostname'].return_value = 'name.fqdn'
        obj = self.klass('name.fqdn', 'type', '1.0')
        assert obj.name == 'name.fqdn'
        assert obj.shortname == 'name'
        assert obj.os_type == 'type'
        assert obj.os_version == '1.0'

    def test_create(self):
        """create() drives each deployment step exactly once"""
        self.mocks['m_Remote_hostname'].return_value = 'name.fqdn'
        self.mocks['m_Remote_machine_type'].return_value = 'type1'
        obj = self.klass('name.fqdn', 'type', '1.0')
        host_id = 99
        with patch.multiple(
            'teuthology.provision.fog.FOG',
            get_host_data=DEFAULT,
            set_image=DEFAULT,
            schedule_deploy_task=DEFAULT,
            wait_for_deploy_task=DEFAULT,
            _wait_for_ready=DEFAULT,
            _fix_hostname=DEFAULT,
        ) as local_mocks:
            local_mocks['get_host_data'].return_value = dict(id=host_id)
            obj.create()
        # NOTE: Mock.called_once_with() is not an assertion (it is an
        # auto-created attribute and always truthy); the assert_* methods
        # are the ones that actually verify the calls.
        # The mocks replace class attributes and are not descriptors, so
        # calls are recorded without 'self'.
        local_mocks['get_host_data'].assert_called_once_with()
        local_mocks['set_image'].assert_called_once_with(host_id)
        local_mocks['schedule_deploy_task'].assert_called_once_with(host_id)
        # create() passes along whatever schedule_deploy_task() returned
        local_mocks['wait_for_deploy_task'].assert_called_once_with(
            local_mocks['schedule_deploy_task'].return_value)
        local_mocks['_wait_for_ready'].assert_called_once_with()
        local_mocks['_fix_hostname'].assert_called_once_with()
        m_console = self.mocks['m_Remote_console'].return_value
        m_console.power_off.assert_called_once_with()
        m_console.power_on.assert_called_once_with()

    def test_do_request(self):
        """do_request() builds the URL, headers and body from its inputs"""
        obj = self.klass('name.fqdn', 'type', '1.0')
        obj.do_request('test_url', data='DATA', method='GET')
        assert len(self.mocks['m_requests_Session_send'].call_args_list) == 1
        req = self.mocks['m_requests_Session_send'].call_args_list[0][0][0]
        assert req.url == test_config['fog']['endpoint'] + 'test_url'
        assert req.method == 'GET'
        assert req.headers['fog-api-token'] == test_config['fog']['api_token']
        assert req.headers['fog-user-token'] == \
            test_config['fog']['user_token']
        assert req.body == 'DATA'

    @mark.parametrize(
        'count',
        [0, 1, 2],
    )
    def test_get_host_data(self, count):
        """get_host_data() requires exactly one matching host"""
        host_objs = [dict(id=i) for i in range(count)]
        resp_obj = dict(count=count, hosts=host_objs)
        self.mocks['m_requests_Session_send']\
            .return_value.json.return_value = resp_obj
        obj = self.klass('name.fqdn', 'type', '1.0')
        if count != 1:
            with raises(RuntimeError):
                obj.get_host_data()
            return
        result = obj.get_host_data()
        assert len(self.mocks['m_requests_Session_send'].call_args_list) == 1
        req = self.mocks['m_requests_Session_send'].call_args_list[0][0][0]
        assert req.url == test_config['fog']['endpoint'] + '/host'
        assert req.body == '{"name": "name"}'
        assert result == host_objs[0]

    @mark.parametrize(
        'count',
        [0, 1, 2],
    )
    def test_get_image_data(self, count):
        """get_image_data() requires at least one matching image"""
        img_objs = [dict(id=i) for i in range(count)]
        resp_obj = dict(count=count, images=img_objs)
        self.mocks['m_requests_Session_send']\
            .return_value.json.return_value = resp_obj
        self.mocks['m_Remote_machine_type'].return_value = 'type1'
        obj = self.klass('name.fqdn', 'windows', 'xp')
        if count < 1:
            with raises(RuntimeError):
                obj.get_image_data()
            return
        result = obj.get_image_data()
        assert len(self.mocks['m_requests_Session_send'].call_args_list) == 1
        req = self.mocks['m_requests_Session_send'].call_args_list[0][0][0]
        assert req.url == test_config['fog']['endpoint'] + '/image'
        assert req.body == '{"name": "type1_windows_xp"}'
        assert result == img_objs[0]

    def test_set_image(self):
        """set_image() PUTs the integer image id to the host's URL"""
        self.mocks['m_Remote_hostname'].return_value = 'name.fqdn'
        self.mocks['m_Remote_machine_type'].return_value = 'type1'
        host_id = 999
        obj = self.klass('name.fqdn', 'type', '1.0')
        with patch.multiple(
            'teuthology.provision.fog.FOG',
            get_image_data=DEFAULT,
            do_request=DEFAULT,
        ) as local_mocks:
            local_mocks['get_image_data'].return_value = dict(id='13')
            obj.set_image(host_id)
        # set_image() converts the id with int() before serializing it, and
        # passes method/data as keyword arguments
        local_mocks['do_request'].assert_called_once_with(
            '/host/999', method='PUT', data='{"imageID": 13}',
        )

    def test_schedule_deploy_task(self):
        """schedule_deploy_task() returns the id of the just-created task"""
        host_id = 12
        tasktype_id = 6
        task_id = 5
        tasktype_result = dict(tasktypes=[dict(name='deploy', id=tasktype_id)])
        schedule_result = dict()
        host_tasks = [dict(
            createdTime=datetime.strftime(
                datetime.utcnow(), self.klass.timestamp_format),
            id=task_id,
        )]
        self.mocks['m_requests_Session_send']\
            .return_value.json.side_effect = [
                tasktype_result, schedule_result,
            ]
        with patch.multiple(
            'teuthology.provision.fog.FOG',
            get_deploy_tasks=DEFAULT,
        ) as local_mocks:
            local_mocks['get_deploy_tasks'].return_value = host_tasks
            obj = self.klass('name.fqdn', 'type', '1.0')
            result = obj.schedule_deploy_task(host_id)
        local_mocks['get_deploy_tasks'].assert_called_once_with()
        # One request to look up the tasktype, one to schedule the task
        assert len(self.mocks['m_requests_Session_send'].call_args_list) == 2
        assert result == task_id

    def test_get_deploy_tasks(self):
        """get_deploy_tasks() keeps only tasks belonging to our host"""
        obj = self.klass('name.fqdn', 'type', '1.0')
        resp_obj = dict(
            count=2,
            tasks=[
                dict(host=dict(name='notme')),
                dict(host=dict(name='name')),
            ]
        )
        self.mocks['m_requests_Session_send']\
            .return_value.json.return_value = resp_obj
        result = obj.get_deploy_tasks()
        assert result[0]['host']['name'] == 'name'

    @mark.parametrize(
        'active_ids',
        [
            [2, 4, 6, 8],
            [1],
            [],
        ]
    )
    def test_deploy_task_active(self, active_ids):
        """deploy_task_active() is True only if our id is among the tasks"""
        our_task_id = 4
        result_objs = [dict(id=task_id) for task_id in active_ids]
        obj = self.klass('name.fqdn', 'type', '1.0')
        with patch.multiple(
            'teuthology.provision.fog.FOG',
            get_deploy_tasks=DEFAULT,
        ) as local_mocks:
            local_mocks['get_deploy_tasks'].return_value = result_objs
            result = obj.deploy_task_active(our_task_id)
        assert result is (our_task_id in active_ids)

    @mark.parametrize(
        'tries',
        [3, 45],
    )
    def test_wait_for_deploy_task(self, tries):
        """wait_for_deploy_task() polls until inactive, giving up at 40"""
        # Active for `tries` polls, then finished
        wait_results = [True] * tries + [False]
        obj = self.klass('name.fqdn', 'type', '1.0')
        with patch.multiple(
            'teuthology.provision.fog.FOG',
            deploy_task_active=DEFAULT,
        ) as local_mocks:
            local_mocks['deploy_task_active'].side_effect = wait_results
            if tries >= 40:
                with raises(MaxWhileTries):
                    obj.wait_for_deploy_task(9)
                return
            obj.wait_for_deploy_task(9)
            assert len(local_mocks['deploy_task_active'].call_args_list) == \
                tries + 1

    @mark.parametrize(
        'tries',
        [1, 51],
    )
    def test_wait_for_ready(self, tries):
        """_wait_for_ready() retries connect(), giving up after 50 tries"""
        # connect() raises `tries` times, then succeeds
        connect_results = [MaxWhileTries] * tries + [True]
        obj = self.klass('name.fqdn', 'type', '1.0')
        self.mocks['m_Remote_connect'].side_effect = connect_results
        if tries >= 50:
            with raises(MaxWhileTries):
                obj._wait_for_ready()
            return
        obj._wait_for_ready()
        assert len(self.mocks['m_Remote_connect'].call_args_list) == tries + 1