]> git.apps.os.sepia.ceph.com Git - teuthology.git/commitdiff
Support reimaging with FOG
author Zack Cerza <zack@redhat.com>
Wed, 23 Aug 2017 20:03:53 +0000 (14:03 -0600)
committer Zack Cerza <zack@redhat.com>
Thu, 30 Nov 2017 19:15:42 +0000 (12:15 -0700)
https://fogproject.org

Signed-off-by: Zack Cerza <zack@redhat.com>
docs/siteconfig.rst
teuthology/lock/cli.py
teuthology/lock/ops.py
teuthology/provision/__init__.py
teuthology/provision/fog.py [new file with mode: 0644]
teuthology/provision/test/test_fog.py [new file with mode: 0644]

index 05b35bbf8b93ac143be0842004c024a215f6bb57..55b2e288aab68219c88261f23c940bef7488a028 100644 (file)
@@ -229,3 +229,10 @@ Here is a sample configuration with many of the options set and documented::
     # Settings for [nsupdate-web](https://github.com/zmc/nsupdate-web)
     # Used by the [libcloud](https://libcloud.apache.org/) backend
     nsupdate_url: http://nsupdate.front.sepia.ceph.com/update
+
+    # Settings for https://fogproject.org/
+    fog:
+      endpoint: http://fog.example.com/fog
+      api_token: your_api_token
+      user_token: your_user_token
+      machine_types: ['mira', 'smithi']
index dfd50ca842d579b1cea32bafbcca6541b88bc82a..a3c3002a2219bc24b83aeaf2fae05b972e3101a0 100644 (file)
@@ -153,6 +153,9 @@ def main(ctx):
                 ret = 1
                 if not ctx.f:
                     return ret
+            elif not query.is_vm(machine):
+                teuthology.provision.reimage(ctx, machine)
+                keys.do_update_keys([machine])
             else:
                 machines_to_update.append(machine)
                 teuthology.provision.create_if_vm(
index 64e1427eb16e451e8ef1283a09433c10a197b23b..309ae044c9da3c41c47625754b6dd7c4c753651e 100644 (file)
@@ -77,7 +77,8 @@ def lock_many(ctx, num, machine_type, user=None, description=None,
         # Only query for os_type/os_version if non-vps and non-libcloud, since
         # in that case we just create them.
         vm_types = ['vps'] + teuthology.provision.cloud.get_types()
-        if machine_type not in vm_types:
+        reimage_types = teuthology.provision.fog.get_types()
+        if machine_type not in vm_types + reimage_types:
             if os_type:
                 data['os_type'] = os_type
             if os_version:
@@ -106,6 +107,13 @@ def lock_many(ctx, num, machine_type, user=None, description=None,
                         unlock_one(ctx, machine, user)
                     ok_machs = keys.do_update_keys(ok_machs.keys())[1]
                 return ok_machs
+            elif machine_type in reimage_types:
+                reimaged = dict()
+                for machine in machines:
+                    teuthology.provision.reimage(ctx, machine)
+                    reimaged[machine] = machines[machine]
+                reimaged = keys.do_update_keys(reimaged.keys())[1]
+                return reimaged
             return machines
         elif response.status_code == 503:
             log.error('Insufficient nodes available to lock %d %s nodes.',
index b5512ea137767810859bc1e9c55088e15555682c..261bc018c0e340a5a9f67b63fa8dfa1ceea92068 100644 (file)
@@ -5,6 +5,7 @@ from ..misc import decanonicalize_hostname, get_distro, get_distro_version
 
 import cloud
 import downburst
+import fog
 import openstack
 import os
 
@@ -18,6 +19,13 @@ def _logfile(ctx, shortname):
                             shortname + '.downburst.log')
 
 
+def reimage(ctx, machine_name):
+    """
+    Reimage a machine using FOG
+
+    :param ctx: The context object; the desired distro and distro version
+                are derived from it via get_distro() and
+                get_distro_version()
+    :param machine_name: The name of the machine to reimage
+    :returns: Whatever FOG.create() returns
+    """
+    provisioner = fog.FOG(
+        machine_name,
+        get_distro(ctx),
+        get_distro_version(ctx),
+    )
+    return provisioner.create()
+
+
 def create_if_vm(ctx, machine_name, _downburst=None):
     """
     Use downburst to create a virtual machine
diff --git a/teuthology/provision/fog.py b/teuthology/provision/fog.py
new file mode 100644 (file)
index 0000000..202ef0c
--- /dev/null
@@ -0,0 +1,289 @@
+import json
+import logging
+import requests
+import socket
+
+from datetime import datetime
+from paramiko import SSHException
+from StringIO import StringIO
+
+import teuthology.orchestra
+
+from ..config import config
+from ..contextutil import safe_while
+from teuthology.exceptions import MaxWhileTries
+from teuthology import misc
+
+# Module-level logger; each FOG instance derives a per-host child from it
+log = logging.getLogger(__name__)
+
+
+def enabled(warn=False):
+    """
+    Check for required FOG settings
+
+    A setting counts as unset when it is missing from the 'fog' section of
+    the config or has a falsy value (e.g. None or an empty string).
+
+    :param warn: Whether or not to log a message containing unset parameters
+    :returns: True if they are present; False if they are not
+    """
+    # Tolerate a 'fog' key that is present but empty/null
+    fog_conf = config.get('fog') or dict()
+    params = ['endpoint', 'api_token', 'user_token', 'machine_types']
+    unset = [param for param in params if not fog_conf.get(param)]
+    if unset and warn:
+        # Logger.warn() is a deprecated alias of Logger.warning()
+        log.warning(
+            "FOG disabled; set the following config options to enable: %s",
+            ' '.join(unset),
+        )
+    return not unset
+
+
+def get_types():
+    """
+    Fetch and parse config.fog['machine_types']
+
+    The setting may be either a list or a comma-separated string. For the
+    string form, whitespace around each entry is ignored. Empty entries are
+    dropped in both forms.
+
+    :returns: The list of FOG-configured machine types. An empty list if FOG is
+              not configured.
+    """
+    if not enabled():
+        return []
+    fog_conf = config.get('fog', dict())
+    types = fog_conf.get('machine_types', '')
+    if not isinstance(types, list):
+        # 'mira, smithi' must yield 'smithi', not ' smithi', or the type
+        # would never match a real machine type
+        types = [type_.strip() for type_ in types.split(',')]
+    return [type_ for type_ in types if type_]
+
+
+class FOG(object):
+    """
+    Reimage bare-metal machines with https://fogproject.org/
+    """
+    # Format used to parse the 'createdTime' field of FOG task objects
+    timestamp_format = '%Y-%m-%d %H:%M:%S'
+
+    def __init__(self, name, os_type, os_version):
+        """
+        :param name: The name of the machine to reimage
+        :param os_type: The OS type to deploy; used to build the image name
+        :param os_version: The OS version to deploy; used to build the image
+                           name
+        """
+        self.remote = teuthology.orchestra.remote.Remote(
+            misc.canonicalize_hostname(name))
+        self.name = self.remote.hostname
+        self.shortname = self.remote.shortname
+        self.os_type = os_type
+        self.os_version = os_version
+        self.log = log.getChild(self.shortname)
+
+    def create(self):
+        """
+        Initiate deployment and wait until completion
+
+        :raises: RuntimeError if FOG is not configured
+        """
+        if not enabled():
+            raise RuntimeError("FOG is not configured!")
+        host_data = self.get_host_data()
+        host_id = int(host_data['id'])
+        self.set_image(host_id)
+        task_id = self.schedule_deploy_task(host_id)
+        # Use power_off/power_on because other methods call _wait_for_login,
+        # which will not work here since the newly-imaged host will have an
+        # incorrect hostname
+        self.remote.console.power_off()
+        self.remote.console.power_on()
+        self.wait_for_deploy_task(task_id)
+        self._wait_for_ready()
+        self._fix_hostname()
+        self.log.info("Deploy complete!")
+
+    def do_request(self, url_suffix, data=None, method='GET', verify=True):
+        """
+        A convenience method to submit a request to the FOG server
+        :param url_suffix: The portion of the URL to append to the endpoint,
+                           e.g.  '/system/info'
+        :param data: Optional JSON data to submit with the request
+        :param method: The HTTP method to use for the request (default: 'GET')
+        :param verify: Whether or not to raise an exception if the request is
+                       unsuccessful (default: True)
+        :returns: A requests.models.Response object
+        """
+        req_kwargs = dict(
+            headers={
+                'fog-api-token': config.fog['api_token'],
+                'fog-user-token': config.fog['user_token'],
+            },
+        )
+        if data is not None:
+            req_kwargs['data'] = data
+        req = requests.Request(
+            method,
+            config.fog['endpoint'] + url_suffix,
+            **req_kwargs
+        )
+        prepped = req.prepare()
+        # Use the session as a context manager so that it gets closed
+        # instead of being abandoned after every request
+        with requests.Session() as session:
+            resp = session.send(prepped)
+        if not resp.ok and resp.text:
+            self.log.error("%s: %s", resp.status_code, resp.text)
+        if verify:
+            resp.raise_for_status()
+        return resp
+
+    def get_host_data(self):
+        """
+        Locate the host we want to use, and return the FOG object which
+        represents it
+        :returns: A dict describing the host
+        :raises: RuntimeError unless exactly one host matches our shortname
+        """
+        resp = self.do_request(
+            '/host',
+            data=json.dumps(dict(name=self.shortname)),
+        )
+        obj = resp.json()
+        if obj['count'] == 0:
+            raise RuntimeError("Host %s not found!" % self.shortname)
+        if obj['count'] > 1:
+            raise RuntimeError(
+                "More than one host found for %s" % self.shortname)
+        return obj['hosts'][0]
+
+    def get_image_data(self):
+        """
+        Locate the image we want to use, and return the FOG object which
+        represents it. The image is looked up by a name of the form
+        <machine_type>_<os_type, lowercased>_<os_version>.
+        :returns: A dict describing the image (the first one returned by FOG)
+        :raises: RuntimeError if no image is found
+        """
+        name = '_'.join([
+            self.remote.machine_type, self.os_type.lower(), self.os_version])
+        resp = self.do_request(
+            '/image',
+            data=json.dumps(dict(name=name)),
+        )
+        obj = resp.json()
+        if not obj['count']:
+            # Interpolate explicitly; exceptions do not apply %-formatting
+            # to their arguments the way logging calls do
+            raise RuntimeError(
+                "Could not find an image for %s %s" % (
+                    self.os_type, self.os_version)
+            )
+        return obj['images'][0]
+
+    def set_image(self, host_id):
+        """
+        Tell FOG to use the proper image on the next deploy
+        :param host_id: The id of the host to deploy
+        """
+        image_data = self.get_image_data()
+        image_id = int(image_data['id'])
+        self.do_request(
+            '/host/%s' % host_id,
+            method='PUT',
+            data=json.dumps(dict(imageID=image_id)),
+        )
+
+    def schedule_deploy_task(self, host_id):
+        """
+        Schedule a 'deploy' task for the host, then locate that task among
+        the tasks which are active on the host
+
+        :param host_id: The id of the host to deploy
+        :returns: The id of the scheduled task, or None if no sufficiently
+                  recent task could be found for our host
+        :raises: RuntimeError if FOG reports no 'deploy' task type
+        """
+        self.log.info(
+            "Scheduling deploy of %s %s",
+            self.os_type, self.os_version)
+        # First, we need to find the right tasktype ID
+        resp = self.do_request(
+            '/tasktype',
+            data=json.dumps(dict(name='deploy')),
+        )
+        tasktypes = [obj for obj in resp.json()['tasktypes']
+                     if obj['name'].lower() == 'deploy']
+        if not tasktypes:
+            raise RuntimeError("Could not find the 'deploy' task type!")
+        deploy_id = int(tasktypes[0]['id'])
+        # Next, schedule the task
+        self.do_request(
+            '/host/%i/task' % host_id,
+            method='POST',
+            data=json.dumps(dict(taskTypeID=deploy_id)),
+        )
+        host_tasks = self.get_deploy_tasks()
+        for task in host_tasks:
+            timestamp = task['createdTime']
+            # NOTE: comparing against utcnow() presumes that FOG reports
+            # createdTime in UTC
+            time_delta = (
+                datetime.utcnow() - datetime.strptime(
+                    timestamp, self.timestamp_format)
+            ).total_seconds()
+            # There should only be one deploy task matching our host. Just in
+            # case there are multiple, select a very recent one.
+            if time_delta < 5:
+                return task['id']
+        self.log.warning(
+            "Could not find a recently-created deploy task for this host")
+        return None
+
+    def get_deploy_tasks(self):
+        """
+        Note that no filtering by task type is done here; every active task
+        whose host name matches our shortname is returned.
+
+        :returns: A list of tasks which are active on our host
+        """
+        resp = self.do_request('/task/active')
+        tasks = resp.json()['tasks']
+        host_tasks = [obj for obj in tasks
+                      if obj['host']['name'] == self.shortname]
+        return host_tasks
+
+    def deploy_task_active(self, task_id):
+        """
+        :param task_id: The id of the task to query
+        :returns: True if the task is active
+        """
+        host_tasks = self.get_deploy_tasks()
+        return any(task['id'] == task_id for task in host_tasks)
+
+    def wait_for_deploy_task(self, task_id):
+        """
+        Wait until the specified task is no longer active (i.e., it has
+        completed)
+
+        :param task_id: The id of the task to wait for
+        """
+        self.log.info("Waiting for deploy to finish")
+        with safe_while(sleep=15, tries=40) as proceed:
+            while proceed():
+                if not self.deploy_task_active(task_id):
+                    break
+
+    def _wait_for_ready(self):
+        """ Attempt to connect to the machine via SSH """
+        with safe_while(sleep=6, tries=50) as proceed:
+            while proceed():
+                try:
+                    self.remote.connect()
+                    break
+                except (
+                    socket.error,
+                    SSHException,
+                    MaxWhileTries,
+                    EOFError,
+                ):
+                    # Not reachable yet; let safe_while pace the next try
+                    pass
+
+    def _fix_hostname(self):
+        """
+        After a reimage, the host will still have the hostname of the machine
+        used to create the image initially. Fix that by making a call to
+        /bin/hostname and tweaking /etc/hosts and /etc/hostname.
+        """
+        proc = self.remote.run(args='hostname', stdout=StringIO())
+        wrong_hostname = proc.stdout.read().strip()
+        proc = self.remote.run(
+            args='grep %s /etc/hosts' % wrong_hostname,
+            stdout=StringIO(),
+            check_status=False,
+        )
+        if proc.returncode == 0:
+            # Fields in /etc/hosts may be separated by tabs or by runs of
+            # spaces, so split on any whitespace to isolate the address
+            wrong_ip = proc.stdout.readlines()[0].split()[0]
+            self.remote.run(args="sudo hostname %s" % self.shortname)
+            self.remote.run(
+                args="sudo sed -i -e 's/%s/%s/g' /etc/hosts" % (
+                    wrong_hostname, self.shortname),
+            )
+            self.remote.run(
+                args="sudo sed -i -e 's/%s/%s/g' /etc/hosts" % (
+                    wrong_ip, self.remote.ip_address),
+            )
+        # The remaining fixes are best-effort, hence check_status=False
+        self.remote.run(
+            args="sudo sed -i -e 's/%s/%s/g' /etc/hostname" % (
+                wrong_hostname, self.shortname),
+            check_status=False,
+        )
+        self.remote.run(
+            args="sudo hostname %s" % self.shortname,
+            check_status=False,
+        )
+
+    def destroy(self):
+        """A no-op; we just leave idle nodes as-is"""
+        pass
diff --git a/teuthology/provision/test/test_fog.py b/teuthology/provision/test/test_fog.py
new file mode 100644 (file)
index 0000000..339b9ed
--- /dev/null
@@ -0,0 +1,276 @@
+from copy import deepcopy
+from datetime import datetime
+from mock import patch, DEFAULT, PropertyMock
+from pytest import raises, mark
+
+from teuthology.config import config
+from teuthology.exceptions import MaxWhileTries
+from teuthology.provision import fog
+
+
+# FOG settings merged into the teuthology config before each test.
+# machine_types is given as a comma-separated string rather than a list, so
+# fog.get_types() has to split it.
+test_config = dict(fog=dict(
+    endpoint='http://fog.example.com/fog',
+    api_token='API_TOKEN',
+    user_token='USER_TOKEN',
+    machine_types='type1,type2',
+))
+
+
+class TestFOG(object):
+    """
+    Unit tests for teuthology.provision.fog. HTTP traffic, sleeping and the
+    Remote object's connection-related attributes are all mocked out.
+    """
+    klass = fog.FOG
+
+    def setup(self):
+        config.load()
+        config.update(deepcopy(test_config))
+        self.start_patchers()
+
+    def start_patchers(self):
+        """
+        Start all patchers, storing the resulting mocks in self.mocks under
+        the same keys used in self.patchers
+        """
+        self.patchers = dict()
+        self.patchers['m_sleep'] = patch(
+            'time.sleep',
+        )
+        self.patchers['m_requests_Session_send'] = patch(
+            'requests.Session.send',
+        )
+        self.patchers['m_Remote_connect'] = patch(
+            'teuthology.orchestra.remote.Remote.connect'
+        )
+        self.patchers['m_Remote_run'] = patch(
+            'teuthology.orchestra.remote.Remote.run'
+        )
+        self.patchers['m_Remote_console'] = patch(
+            'teuthology.orchestra.remote.Remote.console',
+            new_callable=PropertyMock,
+        )
+        self.patchers['m_Remote_hostname'] = patch(
+            'teuthology.orchestra.remote.Remote.hostname',
+            new_callable=PropertyMock,
+        )
+        self.patchers['m_Remote_machine_type'] = patch(
+            'teuthology.orchestra.remote.Remote.machine_type',
+            new_callable=PropertyMock,
+        )
+        self.mocks = dict()
+        for name, patcher in self.patchers.items():
+            self.mocks[name] = patcher.start()
+
+    def teardown(self):
+        for patcher in self.patchers.values():
+            patcher.stop()
+
+    @mark.parametrize('enabled', [True, False])
+    def test_get_types(self, enabled):
+        with patch('teuthology.provision.fog.enabled') as m_enabled:
+            m_enabled.return_value = enabled
+            types = fog.get_types()
+        if enabled:
+            assert types == test_config['fog']['machine_types'].split(',')
+        else:
+            assert types == []
+
+    def test_disabled(self):
+        config.fog['endpoint'] = None
+        obj = self.klass('name.fqdn', 'type', '1.0')
+        with raises(RuntimeError):
+            obj.create()
+
+    def test_init(self):
+        self.mocks['m_Remote_hostname'].return_value = 'name.fqdn'
+        obj = self.klass('name.fqdn', 'type', '1.0')
+        assert obj.name == 'name.fqdn'
+        assert obj.shortname == 'name'
+        assert obj.os_type == 'type'
+        assert obj.os_version == '1.0'
+
+    def test_create(self):
+        self.mocks['m_Remote_hostname'].return_value = 'name.fqdn'
+        self.mocks['m_Remote_machine_type'].return_value = 'type1'
+        obj = self.klass('name.fqdn', 'type', '1.0')
+        host_id = 99
+        with patch.multiple(
+            'teuthology.provision.fog.FOG',
+            get_host_data=DEFAULT,
+            set_image=DEFAULT,
+            schedule_deploy_task=DEFAULT,
+            wait_for_deploy_task=DEFAULT,
+            _wait_for_ready=DEFAULT,
+            _fix_hostname=DEFAULT,
+        ) as local_mocks:
+            local_mocks['get_host_data'].return_value = dict(id=host_id)
+            obj.create()
+            # Use the assert_* methods: 'assert m.called_once_with(...)' only
+            # reads an auto-created Mock attribute and can never fail
+            local_mocks['get_host_data'].assert_called_once_with()
+            local_mocks['set_image'].assert_called_once_with(host_id)
+            local_mocks['schedule_deploy_task'].assert_called_once_with(
+                host_id)
+            # create() hands the scheduled task's id to wait_for_deploy_task
+            task_id = local_mocks['schedule_deploy_task'].return_value
+            local_mocks['wait_for_deploy_task'].assert_called_once_with(
+                task_id)
+            local_mocks['_wait_for_ready'].assert_called_once_with()
+            local_mocks['_fix_hostname'].assert_called_once_with()
+        m_console = self.mocks['m_Remote_console'].return_value
+        m_console.power_off.assert_called_once_with()
+        m_console.power_on.assert_called_once_with()
+
+    def test_do_request(self):
+        obj = self.klass('name.fqdn', 'type', '1.0')
+        obj.do_request('test_url', data='DATA', method='GET')
+        assert len(self.mocks['m_requests_Session_send'].call_args_list) == 1
+        req = self.mocks['m_requests_Session_send'].call_args_list[0][0][0]
+        assert req.url == test_config['fog']['endpoint'] + 'test_url'
+        assert req.method == 'GET'
+        assert req.headers['fog-api-token'] == test_config['fog']['api_token']
+        assert req.headers['fog-user-token'] == test_config['fog']['user_token']
+        assert req.body == 'DATA'
+
+    @mark.parametrize(
+        'count',
+        [0, 1, 2],
+    )
+    def test_get_host_data(self, count):
+        host_objs = [dict(id=i) for i in range(count)]
+        resp_obj = dict(count=count, hosts=host_objs)
+        self.mocks['m_requests_Session_send']\
+            .return_value.json.return_value = resp_obj
+        obj = self.klass('name.fqdn', 'type', '1.0')
+        if count != 1:
+            with raises(RuntimeError):
+                obj.get_host_data()
+            return
+        result = obj.get_host_data()
+        assert len(self.mocks['m_requests_Session_send'].call_args_list) == 1
+        req = self.mocks['m_requests_Session_send'].call_args_list[0][0][0]
+        assert req.url == test_config['fog']['endpoint'] + '/host'
+        assert req.body == '{"name": "name"}'
+        assert result == host_objs[0]
+
+    @mark.parametrize(
+        'count',
+        [0, 1, 2],
+    )
+    def test_get_image_data(self, count):
+        img_objs = [dict(id=i) for i in range(count)]
+        resp_obj = dict(count=count, images=img_objs)
+        self.mocks['m_requests_Session_send']\
+            .return_value.json.return_value = resp_obj
+        self.mocks['m_Remote_machine_type'].return_value = 'type1'
+        obj = self.klass('name.fqdn', 'windows', 'xp')
+        if count < 1:
+            with raises(RuntimeError):
+                obj.get_image_data()
+            return
+        result = obj.get_image_data()
+        assert len(self.mocks['m_requests_Session_send'].call_args_list) == 1
+        req = self.mocks['m_requests_Session_send'].call_args_list[0][0][0]
+        assert req.url == test_config['fog']['endpoint'] + '/image'
+        assert req.body == '{"name": "type1_windows_xp"}'
+        assert result == img_objs[0]
+
+    def test_set_image(self):
+        self.mocks['m_Remote_hostname'].return_value = 'name.fqdn'
+        self.mocks['m_Remote_machine_type'].return_value = 'type1'
+        host_id = 999
+        obj = self.klass('name.fqdn', 'type', '1.0')
+        with patch.multiple(
+            'teuthology.provision.fog.FOG',
+            get_image_data=DEFAULT,
+            do_request=DEFAULT,
+        ) as local_mocks:
+            local_mocks['get_image_data'].return_value = dict(id='13')
+            obj.set_image(host_id)
+            # set_image() converts the image id to an int before serializing
+            # it, and passes method/data as keyword arguments
+            local_mocks['do_request'].assert_called_once_with(
+                '/host/999', method='PUT', data='{"imageID": 13}',
+            )
+
+    def test_schedule_deploy_task(self):
+        host_id = 12
+        tasktype_id = 6
+        task_id = 5
+        tasktype_result = dict(tasktypes=[dict(name='deploy', id=tasktype_id)])
+        schedule_result = dict()
+        host_tasks = [dict(
+            createdTime=datetime.strftime(
+                datetime.utcnow(), self.klass.timestamp_format),
+            id=task_id,
+        )]
+        self.mocks['m_requests_Session_send']\
+            .return_value.json.side_effect = [
+            tasktype_result, schedule_result,
+        ]
+        with patch.multiple(
+            'teuthology.provision.fog.FOG',
+            get_deploy_tasks=DEFAULT,
+        ) as local_mocks:
+            local_mocks['get_deploy_tasks'].return_value = host_tasks
+            obj = self.klass('name.fqdn', 'type', '1.0')
+            result = obj.schedule_deploy_task(host_id)
+            local_mocks['get_deploy_tasks'].assert_called_once_with()
+        assert len(self.mocks['m_requests_Session_send'].call_args_list) == 2
+        assert result == task_id
+
+    def test_get_deploy_tasks(self):
+        obj = self.klass('name.fqdn', 'type', '1.0')
+        resp_obj = dict(
+            count=2,
+            tasks=[
+                dict(host=dict(name='notme')),
+                dict(host=dict(name='name')),
+            ]
+        )
+        self.mocks['m_requests_Session_send']\
+            .return_value.json.return_value = resp_obj
+        result = obj.get_deploy_tasks()
+        assert result[0]['host']['name'] == 'name'
+
+    @mark.parametrize(
+        'active_ids',
+        [
+            [2, 4, 6, 8],
+            [1],
+            [],
+        ]
+    )
+    def test_deploy_task_active(self, active_ids):
+        our_task_id = 4
+        result_objs = [dict(id=task_id) for task_id in active_ids]
+        obj = self.klass('name.fqdn', 'type', '1.0')
+        with patch.multiple(
+            'teuthology.provision.fog.FOG',
+            get_deploy_tasks=DEFAULT,
+        ) as local_mocks:
+            local_mocks['get_deploy_tasks'].return_value = result_objs
+            result = obj.deploy_task_active(our_task_id)
+            assert result is (our_task_id in active_ids)
+
+    @mark.parametrize(
+        'tries',
+        [3, 45],
+    )
+    def test_wait_for_deploy_task(self, tries):
+        # The task reports as active 'tries' times, then as finished
+        wait_results = [True for i in range(tries)] + [False]
+        obj = self.klass('name.fqdn', 'type', '1.0')
+        with patch.multiple(
+            'teuthology.provision.fog.FOG',
+            deploy_task_active=DEFAULT,
+        ) as local_mocks:
+            local_mocks['deploy_task_active'].side_effect = wait_results
+            # wait_for_deploy_task() gives up after 40 tries
+            if tries >= 40:
+                with raises(MaxWhileTries):
+                    obj.wait_for_deploy_task(9)
+                return
+            obj.wait_for_deploy_task(9)
+            assert len(local_mocks['deploy_task_active'].call_args_list) == \
+                tries + 1
+
+    @mark.parametrize(
+        'tries',
+        [1, 51],
+    )
+    def test_wait_for_ready(self, tries):
+        # connect() fails 'tries' times, then succeeds
+        connect_results = [MaxWhileTries for i in range(tries)] + [True]
+        obj = self.klass('name.fqdn', 'type', '1.0')
+        self.mocks['m_Remote_connect'].side_effect = connect_results
+        # _wait_for_ready() gives up after 50 tries
+        if tries >= 50:
+            with raises(MaxWhileTries):
+                obj._wait_for_ready()
+            return
+        obj._wait_for_ready()
+        assert len(self.mocks['m_Remote_connect'].call_args_list) == tries + 1