From 5b5f02ecd2c296fd5591a3d00d29f3e8a1bfdb08 Mon Sep 17 00:00:00 2001 From: Zack Cerza Date: Wed, 6 Aug 2014 10:06:34 -0600 Subject: [PATCH] Remove most ceph-specific tasks. They are in ceph-qa-suite now. Signed-off-by: Zack Cerza --- teuthology/task/admin_socket.py | 192 --- teuthology/task/apache.conf.template | 42 - teuthology/task/autotest.py | 166 --- teuthology/task/blktrace.py | 93 -- teuthology/task/calamari/http_client.py | 79 -- teuthology/task/calamari/servertest_1_0.py | 269 ---- teuthology/task/ceph.py | 1277 ----------------- teuthology/task/ceph_client.py | 40 - teuthology/task/ceph_deploy.py | 478 ------- teuthology/task/ceph_fuse.py | 105 -- teuthology/task/ceph_manager.py | 1436 -------------------- teuthology/task/cephfs/__init__.py | 0 teuthology/task/cephfs/filesystem.py | 221 --- teuthology/task/cephfs/fuse_mount.py | 253 ---- teuthology/task/cephfs/kernel_mount.py | 98 -- teuthology/task/cephfs/mount.py | 158 --- teuthology/task/chef.py | 35 - teuthology/task/cifs_mount.py | 137 -- teuthology/task/cram.py | 135 -- teuthology/task/devstack.py | 382 ------ teuthology/task/die_on_err.py | 70 - teuthology/task/divergent_priors.py | 148 -- teuthology/task/dump_stuck.py | 146 -- teuthology/task/ec_lost_unfound.py | 137 -- teuthology/task/filestore_idempotent.py | 81 -- teuthology/task/kclient.py | 71 - teuthology/task/locktest.py | 134 -- teuthology/task/lost_unfound.py | 156 --- teuthology/task/manypools.py | 73 - teuthology/task/mds_client_recovery.py | 352 ----- teuthology/task/mds_creation_failure.py | 85 -- teuthology/task/mds_journal_migration.py | 106 -- teuthology/task/mds_thrash.py | 352 ----- teuthology/task/metadata.yaml | 2 - teuthology/task/mon_clock_skew_check.py | 261 ---- teuthology/task/mon_recovery.py | 80 -- teuthology/task/mon_thrash.py | 343 ----- teuthology/task/multibench.py | 57 - teuthology/task/object_source_down.py | 103 -- teuthology/task/omapbench.py | 83 -- teuthology/task/osd_backfill.py | 105 -- 
teuthology/task/osd_failsafe_enospc.py | 218 --- teuthology/task/osd_recovery.py | 206 --- teuthology/task/peer.py | 96 -- teuthology/task/peering_speed_test.py | 93 -- teuthology/task/populate_rbd_pool.py | 93 -- teuthology/task/qemu.py | 327 ----- teuthology/task/rados.py | 200 --- teuthology/task/radosbench.py | 95 -- teuthology/task/radosgw_admin.py | 983 -------------- teuthology/task/radosgw_admin_rest.py | 678 --------- teuthology/task/radosgw_agent.py | 211 --- teuthology/task/rbd.py | 512 ------- teuthology/task/rbd_fsx.py | 82 -- teuthology/task/recovery_bench.py | 208 --- teuthology/task/rep_lost_unfound_delete.py | 156 --- teuthology/task/repair_test.py | 297 ---- teuthology/task/rest_api.py | 183 --- teuthology/task/restart.py | 163 --- teuthology/task/rgw.py | 808 ----------- teuthology/task/rgw_logsocket.py | 161 --- teuthology/task/s3readwrite.py | 346 ----- teuthology/task/s3roundtrip.py | 302 ---- teuthology/task/s3tests.py | 402 ------ teuthology/task/samba.py | 243 ---- teuthology/task/scrub.py | 117 -- teuthology/task/scrub_test.py | 199 --- teuthology/task/test/__init__.py | 0 teuthology/task/test/test_devstack.py | 48 - teuthology/task/tgt.py | 177 --- teuthology/task/thrashosds.py | 179 --- teuthology/task/userdata_setup.yaml | 22 - teuthology/task/userdata_teardown.yaml | 11 - teuthology/task/watch_notify_stress.py | 69 - teuthology/task/workunit.py | 372 ----- teuthology/task_util/__init__.py | 0 teuthology/task_util/rados.py | 78 -- teuthology/task_util/rgw.py | 153 --- teuthology/task_util/test/__init__.py | 0 teuthology/task_util/test/test_rados.py | 40 - 80 files changed, 17069 deletions(-) delete mode 100644 teuthology/task/admin_socket.py delete mode 100644 teuthology/task/apache.conf.template delete mode 100644 teuthology/task/autotest.py delete mode 100644 teuthology/task/blktrace.py delete mode 100755 teuthology/task/calamari/http_client.py delete mode 100755 teuthology/task/calamari/servertest_1_0.py delete mode 100644 
teuthology/task/ceph.py delete mode 100644 teuthology/task/ceph_client.py delete mode 100644 teuthology/task/ceph_deploy.py delete mode 100644 teuthology/task/ceph_fuse.py delete mode 100644 teuthology/task/ceph_manager.py delete mode 100644 teuthology/task/cephfs/__init__.py delete mode 100644 teuthology/task/cephfs/filesystem.py delete mode 100644 teuthology/task/cephfs/fuse_mount.py delete mode 100644 teuthology/task/cephfs/kernel_mount.py delete mode 100644 teuthology/task/cephfs/mount.py delete mode 100644 teuthology/task/chef.py delete mode 100644 teuthology/task/cifs_mount.py delete mode 100644 teuthology/task/cram.py delete mode 100644 teuthology/task/devstack.py delete mode 100644 teuthology/task/die_on_err.py delete mode 100644 teuthology/task/divergent_priors.py delete mode 100644 teuthology/task/dump_stuck.py delete mode 100644 teuthology/task/ec_lost_unfound.py delete mode 100644 teuthology/task/filestore_idempotent.py delete mode 100644 teuthology/task/kclient.py delete mode 100755 teuthology/task/locktest.py delete mode 100644 teuthology/task/lost_unfound.py delete mode 100644 teuthology/task/manypools.py delete mode 100644 teuthology/task/mds_client_recovery.py delete mode 100644 teuthology/task/mds_creation_failure.py delete mode 100644 teuthology/task/mds_journal_migration.py delete mode 100644 teuthology/task/mds_thrash.py delete mode 100644 teuthology/task/metadata.yaml delete mode 100644 teuthology/task/mon_clock_skew_check.py delete mode 100644 teuthology/task/mon_recovery.py delete mode 100644 teuthology/task/mon_thrash.py delete mode 100644 teuthology/task/multibench.py delete mode 100644 teuthology/task/object_source_down.py delete mode 100644 teuthology/task/omapbench.py delete mode 100644 teuthology/task/osd_backfill.py delete mode 100644 teuthology/task/osd_failsafe_enospc.py delete mode 100644 teuthology/task/osd_recovery.py delete mode 100644 teuthology/task/peer.py delete mode 100644 teuthology/task/peering_speed_test.py delete mode 
100644 teuthology/task/populate_rbd_pool.py delete mode 100644 teuthology/task/qemu.py delete mode 100644 teuthology/task/rados.py delete mode 100644 teuthology/task/radosbench.py delete mode 100644 teuthology/task/radosgw_admin.py delete mode 100644 teuthology/task/radosgw_admin_rest.py delete mode 100644 teuthology/task/radosgw_agent.py delete mode 100644 teuthology/task/rbd.py delete mode 100644 teuthology/task/rbd_fsx.py delete mode 100644 teuthology/task/recovery_bench.py delete mode 100644 teuthology/task/rep_lost_unfound_delete.py delete mode 100644 teuthology/task/repair_test.py delete mode 100644 teuthology/task/rest_api.py delete mode 100644 teuthology/task/restart.py delete mode 100644 teuthology/task/rgw.py delete mode 100644 teuthology/task/rgw_logsocket.py delete mode 100644 teuthology/task/s3readwrite.py delete mode 100644 teuthology/task/s3roundtrip.py delete mode 100644 teuthology/task/s3tests.py delete mode 100644 teuthology/task/samba.py delete mode 100644 teuthology/task/scrub.py delete mode 100644 teuthology/task/scrub_test.py delete mode 100644 teuthology/task/test/__init__.py delete mode 100644 teuthology/task/test/test_devstack.py delete mode 100644 teuthology/task/tgt.py delete mode 100644 teuthology/task/thrashosds.py delete mode 100644 teuthology/task/userdata_setup.yaml delete mode 100644 teuthology/task/userdata_teardown.yaml delete mode 100644 teuthology/task/watch_notify_stress.py delete mode 100644 teuthology/task/workunit.py delete mode 100644 teuthology/task_util/__init__.py delete mode 100644 teuthology/task_util/rados.py delete mode 100644 teuthology/task_util/rgw.py delete mode 100644 teuthology/task_util/test/__init__.py delete mode 100644 teuthology/task_util/test/test_rados.py diff --git a/teuthology/task/admin_socket.py b/teuthology/task/admin_socket.py deleted file mode 100644 index 20a670122a..0000000000 --- a/teuthology/task/admin_socket.py +++ /dev/null @@ -1,192 +0,0 @@ -""" -Admin Socket task -- used in rados, 
powercycle, and smoke testing -""" -from cStringIO import StringIO - -import json -import logging -import os -import time - -from ..orchestra import run -from teuthology import misc as teuthology -from teuthology.parallel import parallel - -log = logging.getLogger(__name__) - - -def task(ctx, config): - """ - Run an admin socket command, make sure the output is json, and run - a test program on it. The test program should read json from - stdin. This task succeeds if the test program exits with status 0. - - To run the same test on all clients:: - - tasks: - - ceph: - - rados: - - admin_socket: - all: - dump_requests: - test: http://example.com/script - - To restrict it to certain clients:: - - tasks: - - ceph: - - rados: [client.1] - - admin_socket: - client.1: - dump_requests: - test: http://example.com/script - - If an admin socket command has arguments, they can be specified as - a list:: - - tasks: - - ceph: - - rados: [client.0] - - admin_socket: - client.0: - dump_requests: - test: http://example.com/script - help: - test: http://example.com/test_help_version - args: [version] - - Note that there must be a ceph client with an admin socket running - before this task is run. The tests are parallelized at the client - level. Tests for a single client are run serially. - - :param ctx: Context - :param config: Configuration - """ - assert isinstance(config, dict), \ - 'admin_socket task requires a dict for configuration' - teuthology.replace_all_with_clients(ctx.cluster, config) - - with parallel() as ptask: - for client, tests in config.iteritems(): - ptask.spawn(_run_tests, ctx, client, tests) - - -def _socket_command(ctx, remote, socket_path, command, args): - """ - Run an admin socket command and return the result as a string. 
- - :param ctx: Context - :param remote: Remote site - :param socket_path: path to socket - :param command: command to be run remotely - :param args: command arguments - - :returns: output of command in json format - """ - json_fp = StringIO() - testdir = teuthology.get_testdir(ctx) - max_tries = 60 - while True: - proc = remote.run( - args=[ - 'sudo', - 'adjust-ulimits', - 'ceph-coverage', - '{tdir}/archive/coverage'.format(tdir=testdir), - 'ceph', - '--admin-daemon', socket_path, - ] + command.split(' ') + args, - stdout=json_fp, - check_status=False, - ) - if proc.exitstatus == 0: - break - assert max_tries > 0 - max_tries -= 1 - log.info('ceph cli returned an error, command not registered yet?') - log.info('sleeping and retrying ...') - time.sleep(1) - out = json_fp.getvalue() - json_fp.close() - log.debug('admin socket command %s returned %s', command, out) - return json.loads(out) - -def _run_tests(ctx, client, tests): - """ - Create a temp directory and wait for a client socket to be created. - For each test, copy the executable locally and run the test. - Remove temp directory when finished. 
- - :param ctx: Context - :param client: client machine to run the test - :param tests: list of tests to run - """ - testdir = teuthology.get_testdir(ctx) - log.debug('Running admin socket tests on %s', client) - (remote,) = ctx.cluster.only(client).remotes.iterkeys() - socket_path = '/var/run/ceph/ceph-{name}.asok'.format(name=client) - overrides = ctx.config.get('overrides', {}).get('admin_socket', {}) - - try: - tmp_dir = os.path.join( - testdir, - 'admin_socket_{client}'.format(client=client), - ) - remote.run( - args=[ - 'mkdir', - '--', - tmp_dir, - run.Raw('&&'), - # wait for client process to create the socket - 'while', 'test', '!', '-e', socket_path, run.Raw(';'), - 'do', 'sleep', '1', run.Raw(';'), 'done', - ], - ) - - for command, config in tests.iteritems(): - if config is None: - config = {} - teuthology.deep_merge(config, overrides) - log.debug('Testing %s with config %s', command, str(config)) - - test_path = None - if 'test' in config: - url = config['test'].format( - branch=config.get('branch', 'master') - ) - test_path = os.path.join(tmp_dir, command) - remote.run( - args=[ - 'wget', - '-q', - '-O', - test_path, - '--', - url, - run.Raw('&&'), - 'chmod', - 'u=rx', - '--', - test_path, - ], - ) - - args = config.get('args', []) - assert isinstance(args, list), \ - 'admin socket command args must be a list' - sock_out = _socket_command(ctx, remote, socket_path, command, args) - if test_path is not None: - remote.run( - args=[ - test_path, - ], - stdin=json.dumps(sock_out), - ) - - finally: - remote.run( - args=[ - 'rm', '-rf', '--', tmp_dir, - ], - ) diff --git a/teuthology/task/apache.conf.template b/teuthology/task/apache.conf.template deleted file mode 100644 index c6fc6620e8..0000000000 --- a/teuthology/task/apache.conf.template +++ /dev/null @@ -1,42 +0,0 @@ - - LoadModule env_module {mod_path}/mod_env.so - - - LoadModule rewrite_module {mod_path}/mod_rewrite.so - - - LoadModule fastcgi_module {mod_path}/mod_fastcgi.so - - - LoadModule 
log_config_module {mod_path}/mod_log_config.so - - -Listen {port} -ServerName {host} - -ServerRoot {testdir}/apache -ErrorLog {testdir}/archive/apache.{client}/error.log -LogFormat "%h l %u %t \"%r\" %>s %b \"{{Referer}}i\" \"%{{User-agent}}i\"" combined -CustomLog {testdir}/archive/apache.{client}/access.log combined -PidFile {testdir}/apache/tmp.{client}/apache.pid -DocumentRoot {testdir}/apache/htdocs.{client} -FastCgiIPCDir {testdir}/apache/tmp.{client}/fastcgi_sock -FastCgiExternalServer {testdir}/apache/htdocs.{client}/rgw.fcgi -socket rgw_sock -idle-timeout {idle_timeout} -RewriteEngine On - -RewriteRule ^/([a-zA-Z0-9-_.]*)([/]?.*) /rgw.fcgi?page=$1&params=$2&%{{QUERY_STRING}} [E=HTTP_AUTHORIZATION:%{{HTTP:Authorization}},L] - -# Set fastcgi environment variables. -# Note that this is separate from Unix environment variables! -SetEnv RGW_LOG_LEVEL 20 -SetEnv RGW_SHOULD_LOG yes -SetEnv RGW_PRINT_CONTINUE {print_continue} - - - Options +ExecCGI - AllowOverride All - SetHandler fastcgi-script - - -AllowEncodedSlashes On -ServerSignature Off diff --git a/teuthology/task/autotest.py b/teuthology/task/autotest.py deleted file mode 100644 index 24a7675df2..0000000000 --- a/teuthology/task/autotest.py +++ /dev/null @@ -1,166 +0,0 @@ -""" -Run an autotest test on the ceph cluster. -""" -import json -import logging -import os - -from teuthology import misc as teuthology -from teuthology.parallel import parallel -from ..orchestra import run - -log = logging.getLogger(__name__) - -def task(ctx, config): - """ - Run an autotest test on the ceph cluster. - - Only autotest client tests are supported. - - The config is a mapping from role name to list of tests to run on - that client. 
- - For example:: - - tasks: - - ceph: - - ceph-fuse: [client.0, client.1] - - autotest: - client.0: [dbench] - client.1: [bonnie] - - You can also specify a list of tests to run on all clients:: - - tasks: - - ceph: - - ceph-fuse: - - autotest: - all: [dbench] - """ - assert isinstance(config, dict) - config = teuthology.replace_all_with_clients(ctx.cluster, config) - log.info('Setting up autotest...') - testdir = teuthology.get_testdir(ctx) - with parallel() as p: - for role in config.iterkeys(): - (remote,) = ctx.cluster.only(role).remotes.keys() - p.spawn(_download, testdir, remote) - - log.info('Making a separate scratch dir for every client...') - for role in config.iterkeys(): - assert isinstance(role, basestring) - PREFIX = 'client.' - assert role.startswith(PREFIX) - id_ = role[len(PREFIX):] - (remote,) = ctx.cluster.only(role).remotes.iterkeys() - mnt = os.path.join(testdir, 'mnt.{id}'.format(id=id_)) - scratch = os.path.join(mnt, 'client.{id}'.format(id=id_)) - remote.run( - args=[ - 'sudo', - 'install', - '-d', - '-m', '0755', - '--owner={user}'.format(user='ubuntu'), #TODO - '--', - scratch, - ], - ) - - with parallel() as p: - for role, tests in config.iteritems(): - (remote,) = ctx.cluster.only(role).remotes.keys() - p.spawn(_run_tests, testdir, remote, role, tests) - -def _download(testdir, remote): - """ - Download. Does not explicitly support muliple tasks in a single run. 
- """ - remote.run( - args=[ - # explicitly does not support multiple autotest tasks - # in a single run; the result archival would conflict - 'mkdir', '{tdir}/archive/autotest'.format(tdir=testdir), - run.Raw('&&'), - 'mkdir', '{tdir}/autotest'.format(tdir=testdir), - run.Raw('&&'), - 'wget', - '-nv', - '--no-check-certificate', - 'https://github.com/ceph/autotest/tarball/ceph', - '-O-', - run.Raw('|'), - 'tar', - '-C', '{tdir}/autotest'.format(tdir=testdir), - '-x', - '-z', - '-f-', - '--strip-components=1', - ], - ) - -def _run_tests(testdir, remote, role, tests): - """ - Spawned to run test on remote site - """ - assert isinstance(role, basestring) - PREFIX = 'client.' - assert role.startswith(PREFIX) - id_ = role[len(PREFIX):] - mnt = os.path.join(testdir, 'mnt.{id}'.format(id=id_)) - scratch = os.path.join(mnt, 'client.{id}'.format(id=id_)) - - assert isinstance(tests, list) - for idx, testname in enumerate(tests): - log.info('Running autotest client test #%d: %s...', idx, testname) - - tag = 'client.{id}.num{idx}.{testname}'.format( - idx=idx, - testname=testname, - id=id_, - ) - control = '{tdir}/control.{tag}'.format(tdir=testdir, tag=tag) - teuthology.write_file( - remote=remote, - path=control, - data='import json; data=json.loads({data!r}); job.run_test(**data)'.format( - data=json.dumps(dict( - url=testname, - dir=scratch, - # TODO perhaps tag - # results will be in {testdir}/autotest/client/results/dbench - # or {testdir}/autotest/client/results/dbench.{tag} - )), - ), - ) - remote.run( - args=[ - '{tdir}/autotest/client/bin/autotest'.format(tdir=testdir), - '--verbose', - '--harness=simple', - '--tag={tag}'.format(tag=tag), - control, - run.Raw('3>&1'), - ], - ) - - remote.run( - args=[ - 'rm', '-rf', '--', control, - ], - ) - - remote.run( - args=[ - 'mv', - '--', - '{tdir}/autotest/client/results/{tag}'.format(tdir=testdir, tag=tag), - '{tdir}/archive/autotest/{tag}'.format(tdir=testdir, tag=tag), - ], - ) - - remote.run( - args=[ - 'rm', '-rf', 
'--', '{tdir}/autotest'.format(tdir=testdir), - ], - ) diff --git a/teuthology/task/blktrace.py b/teuthology/task/blktrace.py deleted file mode 100644 index 208bfd533c..0000000000 --- a/teuthology/task/blktrace.py +++ /dev/null @@ -1,93 +0,0 @@ -""" -Run blktrace program through teuthology -""" -import contextlib -import logging - -from teuthology import misc as teuthology -from teuthology import contextutil -from ..orchestra import run - -log = logging.getLogger(__name__) -blktrace = '/usr/sbin/blktrace' -daemon_signal = 'term' - -@contextlib.contextmanager -def setup(ctx, config): - """ - Setup all the remotes - """ - osds = ctx.cluster.only(teuthology.is_type('osd')) - log_dir = '{tdir}/archive/performance/blktrace'.format(tdir=teuthology.get_testdir(ctx)) - - for remote, roles_for_host in osds.remotes.iteritems(): - log.info('Creating %s on %s' % (log_dir, remote.name)) - remote.run( - args=['mkdir', '-p', '-m0755', '--', log_dir], - wait=False, - ) - yield - -@contextlib.contextmanager -def execute(ctx, config): - """ - Run the blktrace program on remote machines. 
- """ - procs = [] - testdir = teuthology.get_testdir(ctx) - log_dir = '{tdir}/archive/performance/blktrace'.format(tdir=testdir) - - osds = ctx.cluster.only(teuthology.is_type('osd')) - for remote, roles_for_host in osds.remotes.iteritems(): - roles_to_devs = ctx.disk_config.remote_to_roles_to_dev[remote] - for id_ in teuthology.roles_of_type(roles_for_host, 'osd'): - if roles_to_devs.get(id_): - dev = roles_to_devs[id_] - log.info("running blktrace on %s: %s" % (remote.name, dev)) - - proc = remote.run( - args=[ - 'cd', - log_dir, - run.Raw(';'), - 'daemon-helper', - daemon_signal, - 'sudo', - blktrace, - '-o', - dev.rsplit("/", 1)[1], - '-d', - dev, - ], - wait=False, - stdin=run.PIPE, - ) - procs.append(proc) - try: - yield - finally: - osds = ctx.cluster.only(teuthology.is_type('osd')) - log.info('stopping blktrace processs') - for proc in procs: - proc.stdin.close() - -@contextlib.contextmanager -def task(ctx, config): - """ - Usage: - blktrace: - - Runs blktrace on all clients. - """ - if config is None: - config = dict(('client.{id}'.format(id=id_), None) - for id_ in teuthology.all_roles_of_type(ctx.cluster, 'client')) - elif isinstance(config, list): - config = dict.fromkeys(config) - - with contextutil.nested( - lambda: setup(ctx=ctx, config=config), - lambda: execute(ctx=ctx, config=config), - ): - yield - diff --git a/teuthology/task/calamari/http_client.py b/teuthology/task/calamari/http_client.py deleted file mode 100755 index 84a03c7bfa..0000000000 --- a/teuthology/task/calamari/http_client.py +++ /dev/null @@ -1,79 +0,0 @@ -#!/usr/bin/env python - -import json -import logging -import requests - -log = logging.getLogger(__name__) - - -class AuthenticatedHttpClient(requests.Session): - """ - Client for the calamari REST API, principally exists to do - authentication, but also helpfully prefixes - URLs in requests with the API base URL and JSONizes - POST data. 
- """ - def __init__(self, api_url, username, password): - super(AuthenticatedHttpClient, self).__init__() - self._username = username - self._password = password - self._api_url = api_url - self.headers = { - 'Content-type': "application/json; charset=UTF-8" - } - - def request(self, method, url, **kwargs): - if not url.startswith('/'): - url = self._api_url + url - response = super(AuthenticatedHttpClient, self).request(method, url, **kwargs) - if response.status_code >= 400: - # For the benefit of test logs - print "%s: %s" % (response.status_code, response.content) - return response - - def post(self, url, data=None, **kwargs): - if isinstance(data, dict): - data = json.dumps(data) - return super(AuthenticatedHttpClient, self).post(url, data, **kwargs) - - def patch(self, url, data=None, **kwargs): - if isinstance(data, dict): - data = json.dumps(data) - return super(AuthenticatedHttpClient, self).patch(url, data, **kwargs) - - def login(self): - """ - Authenticate with the Django auth system as - it is exposed in the Calamari REST API. - """ - log.info("Logging in as %s" % self._username) - response = self.get("auth/login/") - response.raise_for_status() - self.headers['X-XSRF-TOKEN'] = response.cookies['XSRF-TOKEN'] - - self.post("auth/login/", { - 'next': "/", - 'username': self._username, - 'password': self._password - }) - response.raise_for_status() - - # Check we're allowed in now. 
- response = self.get("cluster") - response.raise_for_status() - -if __name__ == "__main__": - - import argparse - - p = argparse.ArgumentParser() - p.add_argument('-u', '--uri', default='http://mira035/api/v1/') - p.add_argument('--user', default='admin') - p.add_argument('--pass', dest='password', default='admin') - args, remainder = p.parse_known_args() - - c = AuthenticatedHttpClient(args.uri, args.user, args.password) - c.login() - response = c.request('GET', ''.join(remainder)).json() - print json.dumps(response, indent=2) diff --git a/teuthology/task/calamari/servertest_1_0.py b/teuthology/task/calamari/servertest_1_0.py deleted file mode 100755 index b9b07a3905..0000000000 --- a/teuthology/task/calamari/servertest_1_0.py +++ /dev/null @@ -1,269 +0,0 @@ -#!/usr/bin/env python - -import datetime -import os -import logging -import logging.handlers -import requests -import uuid -import unittest -from http_client import AuthenticatedHttpClient - -log = logging.getLogger(__name__) -log.addHandler(logging.StreamHandler()) -log.setLevel(logging.INFO) - -global base_uri -global client -base_uri = None -server_uri = None -client = None - -def setUpModule(): - global base_uri - global server_uri - global client - try: - base_uri = os.environ['CALAMARI_BASE_URI'] - except KeyError: - log.error('Must define CALAMARI_BASE_URI') - os._exit(1) - if not base_uri.endswith('/'): - base_uri += '/' - if not base_uri.endswith('api/v1/'): - base_uri += 'api/v1/' - client = AuthenticatedHttpClient(base_uri, 'admin', 'admin') - server_uri = base_uri.replace('api/v1/', '') - client.login() - -class RestTest(unittest.TestCase): - 'Base class for all tests here; get class\'s data' - - def setUp(self): - # Called once for each test_* case. 
A bit wasteful, but we - # really like using the simple class variable self.uri - # to customize each derived TestCase - method = getattr(self, 'method', 'GET') - raw = self.uri.startswith('/') - self.response = self.get_object(method, self.uri, raw=raw) - - def get_object(self, method, url, raw=False): - global server_uri - 'Return Python object decoded from JSON response to method/url' - if not raw: - return client.request(method, url).json() - else: - return requests.request(method, server_uri + url).json() - -class TestUserMe(RestTest): - - uri = 'user/me' - - def test_me(self): - self.assertEqual(self.response['username'], 'admin') - -class TestCluster(RestTest): - - uri = 'cluster' - - def test_id(self): - self.assertEqual(self.response[0]['id'], 1) - - def test_times(self): - for time in ( - self.response[0]['cluster_update_time'], - self.response[0]['cluster_update_attempt_time'], - ): - self.assertTrue(is_datetime(time)) - - def test_api_base_url(self): - api_base_url = self.response[0]['api_base_url'] - self.assertTrue(api_base_url.startswith('http')) - self.assertIn('api/v0.1', api_base_url) - -class TestHealth(RestTest): - - uri = 'cluster/1/health' - - def test_cluster(self): - self.assertEqual(self.response['cluster'], 1) - - def test_times(self): - for time in ( - self.response['cluster_update_time'], - self.response['added'], - ): - self.assertTrue(is_datetime(time)) - - def test_report_and_overall_status(self): - self.assertIn('report', self.response) - self.assertIn('overall_status', self.response['report']) - -class TestHealthCounters(RestTest): - - uri = 'cluster/1/health_counters' - - def test_cluster(self): - self.assertEqual(self.response['cluster'], 1) - - def test_time(self): - self.assertTrue(is_datetime(self.response['cluster_update_time'])) - - def test_existence(self): - for section in ('pg', 'mon', 'osd'): - for counter in ('warn', 'critical', 'ok'): - count = self.response[section][counter]['count'] - self.assertIsInstance(count, int) 
- self.assertIsInstance(self.response['pool']['total'], int) - - def test_mds_sum(self): - count = self.response['mds'] - self.assertEqual( - count['up_not_in'] + count['not_up_not_in'] + count['up_in'], - count['total'] - ) - -class TestSpace(RestTest): - - uri = 'cluster/1/space' - - def test_cluster(self): - self.assertEqual(self.response['cluster'], 1) - - def test_times(self): - for time in ( - self.response['cluster_update_time'], - self.response['added'], - ): - self.assertTrue(is_datetime(time)) - - def test_space(self): - for size in ('free_bytes', 'used_bytes', 'capacity_bytes'): - self.assertIsInstance(self.response['space'][size], int) - self.assertGreater(self.response['space'][size], 0) - - def test_report(self): - for size in ('total_used', 'total_space', 'total_avail'): - self.assertIsInstance(self.response['report'][size], int) - self.assertGreater(self.response['report'][size], 0) - -class TestOSD(RestTest): - - uri = 'cluster/1/osd' - - def test_cluster(self): - self.assertEqual(self.response['cluster'], 1) - - def test_times(self): - for time in ( - self.response['cluster_update_time'], - self.response['added'], - ): - self.assertTrue(is_datetime(time)) - - def test_osd_uuid(self): - for osd in self.response['osds']: - uuidobj = uuid.UUID(osd['uuid']) - self.assertEqual(str(uuidobj), osd['uuid']) - - def test_osd_pools(self): - for osd in self.response['osds']: - if osd['up'] != 1: - continue - self.assertIsInstance(osd['pools'], list) - self.assertIsInstance(osd['pools'][0], basestring) - - def test_osd_up_in(self): - for osd in self.response['osds']: - for flag in ('up', 'in'): - self.assertIn(osd[flag], (0, 1)) - - def test_osd_0(self): - osd0 = self.get_object('GET', 'cluster/1/osd/0')['osd'] - for field in osd0.keys(): - if not field.startswith('cluster_update_time'): - self.assertEqual(self.response['osds'][0][field], osd0[field]) - -class TestPool(RestTest): - - uri = 'cluster/1/pool' - - def test_cluster(self): - for pool in 
self.response: - self.assertEqual(pool['cluster'], 1) - - def test_fields_are_ints(self): - for pool in self.response: - for field in ('id', 'used_objects', 'used_bytes'): - self.assertIsInstance(pool[field], int) - - def test_name_is_str(self): - for pool in self.response: - self.assertIsInstance(pool['name'], basestring) - - def test_pool_0(self): - poolid = self.response[0]['id'] - pool = self.get_object('GET', 'cluster/1/pool/{id}'.format(id=poolid)) - self.assertEqual(self.response[0], pool) - -class TestServer(RestTest): - - uri = 'cluster/1/server' - - def test_ipaddr(self): - for server in self.response: - octets = server['addr'].split('.') - self.assertEqual(len(octets), 4) - for octetstr in octets: - octet = int(octetstr) - self.assertIsInstance(octet, int) - self.assertGreaterEqual(octet, 0) - self.assertLessEqual(octet, 255) - - def test_hostname_name_strings(self): - for server in self.response: - for field in ('name', 'hostname'): - self.assertIsInstance(server[field], basestring) - - def test_services(self): - for server in self.response: - self.assertIsInstance(server['services'], list) - for service in server['services']: - self.assertIn(service['type'], ('osd', 'mon', 'mds')) - -class TestGraphitePoolIOPS(RestTest): - - uri = '/graphite/render?format=json-array&' \ - 'target=ceph.cluster.ceph.pool.0.num_read&' \ - 'target=ceph.cluster.ceph.pool.0.num_write' - - def test_targets_contain_request(self): - self.assertIn('targets', self.response) - self.assertIn('ceph.cluster.ceph.pool.0.num_read', - self.response['targets']) - self.assertIn('ceph.cluster.ceph.pool.0.num_write', - self.response['targets']) - - def test_datapoints(self): - self.assertIn('datapoints', self.response) - self.assertGreater(len(self.response['datapoints']), 0) - data = self.response['datapoints'][0] - self.assertEqual(len(data), 3) - self.assertIsInstance(data[0], int) - if data[1]: - self.assertIsInstance(data[1], float) - if data[2]: - self.assertIsInstance(data[2], float) 
- -# -# Utility functions -# - -DATETIME_FORMAT = '%Y-%m-%dT%H:%M:%S.%fZ' - -def is_datetime(time): - datetime.datetime.strptime(time, DATETIME_FORMAT) - return True - -if __name__ == '__main__': - unittest.main() diff --git a/teuthology/task/ceph.py b/teuthology/task/ceph.py deleted file mode 100644 index 458f86f484..0000000000 --- a/teuthology/task/ceph.py +++ /dev/null @@ -1,1277 +0,0 @@ -""" -Ceph cluster task. - -Handle the setup, starting, and clean-up of a Ceph cluster. -""" -from cStringIO import StringIO - -import argparse -import contextlib -import logging -import os -import json -import time - -from teuthology import misc as teuthology -from teuthology import contextutil -from ..orchestra import run -import ceph_client as cclient -from teuthology.orchestra.run import CommandFailedError -from teuthology.orchestra.daemon import DaemonGroup - -DEFAULT_CONF_PATH = '/etc/ceph/ceph.conf' - -log = logging.getLogger(__name__) - - -@contextlib.contextmanager -def ceph_log(ctx, config): - """ - Create /var/log/ceph log directory that is open to everyone. - Add valgrind and profiling-logger directories. - - :param ctx: Context - :param config: Configuration - """ - log.info('Making ceph log dir writeable by non-root...') - run.wait( - ctx.cluster.run( - args=[ - 'sudo', - 'chmod', - '777', - '/var/log/ceph', - ], - wait=False, - ) - ) - log.info('Disabling ceph logrotate...') - run.wait( - ctx.cluster.run( - args=[ - 'sudo', - 'rm', '-f', '--', - '/etc/logrotate.d/ceph', - ], - wait=False, - ) - ) - log.info('Creating extra log directories...') - run.wait( - ctx.cluster.run( - args=[ - 'sudo', - 'install', '-d', '-m0755', '--', - '/var/log/ceph/valgrind', - '/var/log/ceph/profiling-logger', - ], - wait=False, - ) - ) - - try: - yield - - finally: - pass - - -def assign_devs(roles, devs): - """ - Create a dictionary of devs indexed by roles - - :param roles: List of roles - :param devs: Corresponding list of devices. - :returns: Dictionary of devs indexed by roles. 
- """ - return dict(zip(roles, devs)) - -@contextlib.contextmanager -def valgrind_post(ctx, config): - """ - After the tests run, look throught all the valgrind logs. Exceptions are raised - if textual errors occured in the logs, or if valgrind exceptions were detected in - the logs. - - :param ctx: Context - :param config: Configuration - """ - try: - yield - finally: - lookup_procs = list() - log.info('Checking for errors in any valgrind logs...'); - for remote in ctx.cluster.remotes.iterkeys(): - #look at valgrind logs for each node - proc = remote.run( - args=[ - 'sudo', - 'zgrep', - '', - run.Raw('/var/log/ceph/valgrind/*'), - '/dev/null', # include a second file so that we always get a filename prefix on the output - run.Raw('|'), - 'sort', - run.Raw('|'), - 'uniq', - ], - wait=False, - check_status=False, - stdout=StringIO(), - ) - lookup_procs.append((proc, remote)) - - valgrind_exception = None - for (proc, remote) in lookup_procs: - proc.wait() - out = proc.stdout.getvalue() - for line in out.split('\n'): - if line == '': - continue - try: - (file, kind) = line.split(':') - except Exception: - log.error('failed to split line %s', line) - raise - log.debug('file %s kind %s', file, kind) - if (file.find('mds') >= 0) and kind.find('Lost') > 0: - continue - log.error('saw valgrind issue %s in %s', kind, file) - valgrind_exception = Exception('saw valgrind issues') - - if valgrind_exception is not None: - raise valgrind_exception - - -def mount_osd_data(ctx, remote, osd): - """ - Mount a remote OSD - - :param ctx: Context - :param remote: Remote site - :param ods: Osd name - """ - log.debug('Mounting data for osd.{o} on {r}'.format(o=osd, r=remote)) - if remote in ctx.disk_config.remote_to_roles_to_dev and osd in ctx.disk_config.remote_to_roles_to_dev[remote]: - dev = ctx.disk_config.remote_to_roles_to_dev[remote][osd] - mount_options = ctx.disk_config.remote_to_roles_to_dev_mount_options[remote][osd] - fstype = 
ctx.disk_config.remote_to_roles_to_dev_fstype[remote][osd] - mnt = os.path.join('/var/lib/ceph/osd', 'ceph-{id}'.format(id=osd)) - - log.info('Mounting osd.{o}: dev: {n}, mountpoint: {p}, type: {t}, options: {v}'.format( - o=osd, n=remote.name, p=mnt, t=fstype, v=mount_options)) - - remote.run( - args=[ - 'sudo', - 'mount', - '-t', fstype, - '-o', ','.join(mount_options), - dev, - mnt, - ] - ) - -def make_admin_daemon_dir(ctx, remote): - """ - Create /var/run/ceph directory on remote site. - - :param ctx: Context - :param remote: Remote site - """ - remote.run( - args=[ - 'sudo', - 'install', '-d', '-m0777', '--', '/var/run/ceph', - ], - ) - - -def write_conf(ctx, conf_path=DEFAULT_CONF_PATH): - conf_fp = StringIO() - ctx.ceph.conf.write(conf_fp) - conf_fp.seek(0) - writes = ctx.cluster.run( - args=[ - 'sudo', 'mkdir', '-p', '/etc/ceph', run.Raw('&&'), - 'sudo', 'chmod', '0755', '/etc/ceph', run.Raw('&&'), - 'sudo', 'python', - '-c', - 'import shutil, sys; shutil.copyfileobj(sys.stdin, file(sys.argv[1], "wb"))', - conf_path, - run.Raw('&&'), - 'sudo', 'chmod', '0644', conf_path, - ], - stdin=run.PIPE, - wait=False) - log.warn("writes: ") - teuthology.feed_many_stdins_and_close(conf_fp, writes) - run.wait(writes) - - -@contextlib.contextmanager -def cephfs_setup(ctx, config): - testdir = teuthology.get_testdir(ctx) - coverage_dir = '{tdir}/archive/coverage'.format(tdir=testdir) - - first_mon = teuthology.get_first_mon(ctx, config) - (mon_remote,) = ctx.cluster.only(first_mon).remotes.iterkeys() - mdss = ctx.cluster.only(teuthology.is_type('mds')) - # If there are any MDSs, then create a filesystem for them to use - # Do this last because requires mon cluster to be up and running - if mdss.remotes: - log.info('Setting up CephFS filesystem...') - - try: - proc = mon_remote.run(args=['sudo', 'ceph', '--format=json-pretty', 'osd', 'lspools'], - stdout=StringIO()) - pools = json.loads(proc.stdout.getvalue()) - metadata_pool_exists = 'metadata' in [p['poolname'] for p in 
pools] - except CommandFailedError as e: - # For use in upgrade tests, Ceph cuttlefish and earlier don't support - # structured output (--format) from the CLI. - if e.exitstatus == 22: - metadata_pool_exists = True - else: - raise - - # In case we are using an older Ceph which creates FS by default - if metadata_pool_exists: - log.info("Metadata pool already exists, skipping") - else: - mon_remote.run(args=['sudo', 'ceph', 'osd', 'pool', 'create', 'metadata', '256']) - mon_remote.run(args=['sudo', 'ceph', 'osd', 'pool', 'create', 'data', '256']) - - # Use 'newfs' to work with either old or new Ceph, until the 'fs new' - # stuff is all landed. - mon_remote.run(args=['sudo', 'ceph', 'mds', 'newfs', '1', '2']) - # mon_remote.run(args=['sudo', 'ceph', 'fs', 'new', 'default', 'metadata', 'data']) - - is_active_mds = lambda role: role.startswith('mds.') and not role.endswith('-s') and role.find('-s-') == -1 - all_roles = [item for remote_roles in mdss.remotes.values() for item in remote_roles] - num_active = len([r for r in all_roles if is_active_mds(r)]) - mon_remote.run(args=[ - 'adjust-ulimits', - 'ceph-coverage', - coverage_dir, - 'ceph', - 'mds', 'set_max_mds', str(num_active)]) - - yield - - -@contextlib.contextmanager -def cluster(ctx, config): - """ - Handle the creation and removal of a ceph cluster. - - On startup: - Create directories needed for the cluster. - Create remote journals for all osds. - Create and set keyring. - Copy the monmap to tht test systems. - Setup mon nodes. - Setup mds nodes. - Mkfs osd nodes. - Add keyring information to monmaps - Mkfs mon nodes. - - On exit: - If errors occured, extract a failure message and store in ctx.summary. - Unmount all test files and temporary journaling files. - Save the monitor information and archive all ceph logs. - Cleanup the keyring setup, and remove all monitor map and data files left over. 
- - :param ctx: Context - :param config: Configuration - """ - if ctx.config.get('use_existing_cluster', False) is True: - log.info("'use_existing_cluster' is true; skipping cluster creation") - yield - - testdir = teuthology.get_testdir(ctx) - log.info('Creating ceph cluster...') - run.wait( - ctx.cluster.run( - args=[ - 'install', '-d', '-m0755', '--', - '{tdir}/data'.format(tdir=testdir), - ], - wait=False, - ) - ) - - run.wait( - ctx.cluster.run( - args=[ - 'sudo', - 'install', '-d', '-m0777', '--', '/var/run/ceph', - ], - wait=False, - ) - ) - - - devs_to_clean = {} - remote_to_roles_to_devs = {} - remote_to_roles_to_journals = {} - osds = ctx.cluster.only(teuthology.is_type('osd')) - for remote, roles_for_host in osds.remotes.iteritems(): - devs = teuthology.get_scratch_devices(remote) - roles_to_devs = {} - roles_to_journals = {} - if config.get('fs'): - log.info('fs option selected, checking for scratch devs') - log.info('found devs: %s' % (str(devs),)) - devs_id_map = teuthology.get_wwn_id_map(remote, devs) - iddevs = devs_id_map.values() - roles_to_devs = assign_devs( - teuthology.roles_of_type(roles_for_host, 'osd'), iddevs - ) - if len(roles_to_devs) < len(iddevs): - iddevs = iddevs[len(roles_to_devs):] - devs_to_clean[remote] = [] - - if config.get('block_journal'): - log.info('block journal enabled') - roles_to_journals = assign_devs( - teuthology.roles_of_type(roles_for_host, 'osd'), iddevs - ) - log.info('journal map: %s', roles_to_journals) - - if config.get('tmpfs_journal'): - log.info('tmpfs journal enabled') - roles_to_journals = {} - remote.run( args=[ 'sudo', 'mount', '-t', 'tmpfs', 'tmpfs', '/mnt' ] ) - for osd in teuthology.roles_of_type(roles_for_host, 'osd'): - tmpfs = '/mnt/osd.%s' % osd - roles_to_journals[osd] = tmpfs - remote.run( args=[ 'truncate', '-s', '1500M', tmpfs ] ) - log.info('journal map: %s', roles_to_journals) - - log.info('dev map: %s' % (str(roles_to_devs),)) - remote_to_roles_to_devs[remote] = roles_to_devs - 
remote_to_roles_to_journals[remote] = roles_to_journals - - - log.info('Generating config...') - remotes_and_roles = ctx.cluster.remotes.items() - roles = [role_list for (remote, role_list) in remotes_and_roles] - ips = [host for (host, port) in (remote.ssh.get_transport().getpeername() for (remote, role_list) in remotes_and_roles)] - conf = teuthology.skeleton_config(ctx, roles=roles, ips=ips) - for remote, roles_to_journals in remote_to_roles_to_journals.iteritems(): - for role, journal in roles_to_journals.iteritems(): - key = "osd." + str(role) - if key not in conf: - conf[key] = {} - conf[key]['osd journal'] = journal - for section, keys in config['conf'].iteritems(): - for key, value in keys.iteritems(): - log.info("[%s] %s = %s" % (section, key, value)) - if section not in conf: - conf[section] = {} - conf[section][key] = value - - if config.get('tmpfs_journal'): - conf['journal dio'] = False - - ctx.ceph = argparse.Namespace() - ctx.ceph.conf = conf - - keyring_path = config.get('keyring_path', '/etc/ceph/ceph.keyring') - - coverage_dir = '{tdir}/archive/coverage'.format(tdir=testdir) - - firstmon = teuthology.get_first_mon(ctx, config) - - log.info('Setting up %s...' % firstmon) - ctx.cluster.only(firstmon).run( - args=[ - 'sudo', - 'adjust-ulimits', - 'ceph-coverage', - coverage_dir, - 'ceph-authtool', - '--create-keyring', - keyring_path, - ], - ) - ctx.cluster.only(firstmon).run( - args=[ - 'sudo', - 'adjust-ulimits', - 'ceph-coverage', - coverage_dir, - 'ceph-authtool', - '--gen-key', - '--name=mon.', - keyring_path, - ], - ) - ctx.cluster.only(firstmon).run( - args=[ - 'sudo', - 'chmod', - '0644', - keyring_path, - ], - ) - (mon0_remote,) = ctx.cluster.only(firstmon).remotes.keys() - fsid = teuthology.create_simple_monmap( - ctx, - remote=mon0_remote, - conf=conf, - ) - if not 'global' in conf: - conf['global'] = {} - conf['global']['fsid'] = fsid - - log.info('Writing ceph.conf for FSID %s...' 
% fsid) - conf_path = config.get('conf_path', DEFAULT_CONF_PATH) - write_conf(ctx, conf_path) - - log.info('Creating admin key on %s...' % firstmon) - ctx.cluster.only(firstmon).run( - args=[ - 'sudo', - 'adjust-ulimits', - 'ceph-coverage', - coverage_dir, - 'ceph-authtool', - '--gen-key', - '--name=client.admin', - '--set-uid=0', - '--cap', 'mon', 'allow *', - '--cap', 'osd', 'allow *', - '--cap', 'mds', 'allow', - keyring_path, - ], - ) - - log.info('Copying monmap to all nodes...') - keyring = teuthology.get_file( - remote=mon0_remote, - path=keyring_path, - ) - monmap = teuthology.get_file( - remote=mon0_remote, - path='{tdir}/monmap'.format(tdir=testdir), - ) - - for rem in ctx.cluster.remotes.iterkeys(): - # copy mon key and initial monmap - log.info('Sending monmap to node {remote}'.format(remote=rem)) - teuthology.sudo_write_file( - remote=rem, - path=keyring_path, - data=keyring, - perms='0644' - ) - teuthology.write_file( - remote=rem, - path='{tdir}/monmap'.format(tdir=testdir), - data=monmap, - ) - - log.info('Setting up mon nodes...') - mons = ctx.cluster.only(teuthology.is_type('mon')) - run.wait( - mons.run( - args=[ - 'adjust-ulimits', - 'ceph-coverage', - coverage_dir, - 'osdmaptool', - '-c', conf_path, - '--clobber', - '--createsimple', '{num:d}'.format( - num=teuthology.num_instances_of_type(ctx.cluster, 'osd'), - ), - '{tdir}/osdmap'.format(tdir=testdir), - '--pg_bits', '2', - '--pgp_bits', '4', - ], - wait=False, - ), - ) - - log.info('Setting up mds nodes...') - mdss = ctx.cluster.only(teuthology.is_type('mds')) - for remote, roles_for_host in mdss.remotes.iteritems(): - for id_ in teuthology.roles_of_type(roles_for_host, 'mds'): - remote.run( - args=[ - 'sudo', - 'mkdir', - '-p', - '/var/lib/ceph/mds/ceph-{id}'.format(id=id_), - run.Raw('&&'), - 'sudo', - 'adjust-ulimits', - 'ceph-coverage', - coverage_dir, - 'ceph-authtool', - '--create-keyring', - '--gen-key', - '--name=mds.{id}'.format(id=id_), - 
'/var/lib/ceph/mds/ceph-{id}/keyring'.format(id=id_), - ], - ) - - cclient.create_keyring(ctx) - log.info('Running mkfs on osd nodes...') - - ctx.disk_config = argparse.Namespace() - ctx.disk_config.remote_to_roles_to_dev = remote_to_roles_to_devs - ctx.disk_config.remote_to_roles_to_journals = remote_to_roles_to_journals - ctx.disk_config.remote_to_roles_to_dev_mount_options = {} - ctx.disk_config.remote_to_roles_to_dev_fstype = {} - - log.info("ctx.disk_config.remote_to_roles_to_dev: {r}".format(r=str(ctx.disk_config.remote_to_roles_to_dev))) - for remote, roles_for_host in osds.remotes.iteritems(): - roles_to_devs = remote_to_roles_to_devs[remote] - roles_to_journals = remote_to_roles_to_journals[remote] - - - for id_ in teuthology.roles_of_type(roles_for_host, 'osd'): - remote.run( - args=[ - 'sudo', - 'mkdir', - '-p', - '/var/lib/ceph/osd/ceph-{id}'.format(id=id_), - ]) - log.info(str(roles_to_journals)) - log.info(id_) - if roles_to_devs.get(id_): - dev = roles_to_devs[id_] - fs = config.get('fs') - package = None - mkfs_options = config.get('mkfs_options') - mount_options = config.get('mount_options') - if fs == 'btrfs': - #package = 'btrfs-tools' - if mount_options is None: - mount_options = ['noatime','user_subvol_rm_allowed'] - if mkfs_options is None: - mkfs_options = ['-m', 'single', - '-l', '32768', - '-n', '32768'] - if fs == 'xfs': - #package = 'xfsprogs' - if mount_options is None: - mount_options = ['noatime'] - if mkfs_options is None: - mkfs_options = ['-f', '-i', 'size=2048'] - if fs == 'ext4' or fs == 'ext3': - if mount_options is None: - mount_options = ['noatime','user_xattr'] - - if mount_options is None: - mount_options = [] - if mkfs_options is None: - mkfs_options = [] - mkfs = ['mkfs.%s' % fs] + mkfs_options - log.info('%s on %s on %s' % (mkfs, dev, remote)) - if package is not None: - remote.run( - args=[ - 'sudo', - 'apt-get', 'install', '-y', package - ], - stdout=StringIO(), - ) - - try: - remote.run(args= ['yes', run.Raw('|')] + 
['sudo'] + mkfs + [dev]) - except run.CommandFailedError: - # Newer btfs-tools doesn't prompt for overwrite, use -f - if '-f' not in mount_options: - mkfs_options.append('-f') - mkfs = ['mkfs.%s' % fs] + mkfs_options - log.info('%s on %s on %s' % (mkfs, dev, remote)) - remote.run(args= ['yes', run.Raw('|')] + ['sudo'] + mkfs + [dev]) - - log.info('mount %s on %s -o %s' % (dev, remote, - ','.join(mount_options))) - remote.run( - args=[ - 'sudo', - 'mount', - '-t', fs, - '-o', ','.join(mount_options), - dev, - os.path.join('/var/lib/ceph/osd', 'ceph-{id}'.format(id=id_)), - ] - ) - if not remote in ctx.disk_config.remote_to_roles_to_dev_mount_options: - ctx.disk_config.remote_to_roles_to_dev_mount_options[remote] = {} - ctx.disk_config.remote_to_roles_to_dev_mount_options[remote][id_] = mount_options - if not remote in ctx.disk_config.remote_to_roles_to_dev_fstype: - ctx.disk_config.remote_to_roles_to_dev_fstype[remote] = {} - ctx.disk_config.remote_to_roles_to_dev_fstype[remote][id_] = fs - devs_to_clean[remote].append( - os.path.join( - os.path.join('/var/lib/ceph/osd', 'ceph-{id}'.format(id=id_)), - ) - ) - - for id_ in teuthology.roles_of_type(roles_for_host, 'osd'): - remote.run( - args=[ - 'sudo', - 'MALLOC_CHECK_=3', - 'adjust-ulimits', - 'ceph-coverage', - coverage_dir, - 'ceph-osd', - '--mkfs', - '--mkkey', - '-i', id_, - '--monmap', '{tdir}/monmap'.format(tdir=testdir), - ], - ) - - - log.info('Reading keys from all nodes...') - keys_fp = StringIO() - keys = [] - for remote, roles_for_host in ctx.cluster.remotes.iteritems(): - for type_ in ['mds','osd']: - for id_ in teuthology.roles_of_type(roles_for_host, type_): - data = teuthology.get_file( - remote=remote, - path='/var/lib/ceph/{type}/ceph-{id}/keyring'.format( - type=type_, - id=id_, - ), - sudo=True, - ) - keys.append((type_, id_, data)) - keys_fp.write(data) - for remote, roles_for_host in ctx.cluster.remotes.iteritems(): - for type_ in ['client']: - for id_ in 
teuthology.roles_of_type(roles_for_host, type_): - data = teuthology.get_file( - remote=remote, - path='/etc/ceph/ceph.client.{id}.keyring'.format(id=id_) - ) - keys.append((type_, id_, data)) - keys_fp.write(data) - - log.info('Adding keys to all mons...') - writes = mons.run( - args=[ - 'sudo', 'tee', '-a', - keyring_path, - ], - stdin=run.PIPE, - wait=False, - stdout=StringIO(), - ) - keys_fp.seek(0) - teuthology.feed_many_stdins_and_close(keys_fp, writes) - run.wait(writes) - for type_, id_, data in keys: - run.wait( - mons.run( - args=[ - 'sudo', - 'adjust-ulimits', - 'ceph-coverage', - coverage_dir, - 'ceph-authtool', - keyring_path, - '--name={type}.{id}'.format( - type=type_, - id=id_, - ), - ] + list(teuthology.generate_caps(type_)), - wait=False, - ), - ) - - log.info('Running mkfs on mon nodes...') - for remote, roles_for_host in mons.remotes.iteritems(): - for id_ in teuthology.roles_of_type(roles_for_host, 'mon'): - remote.run( - args=[ - 'sudo', - 'mkdir', - '-p', - '/var/lib/ceph/mon/ceph-{id}'.format(id=id_), - ], - ) - remote.run( - args=[ - 'sudo', - 'adjust-ulimits', - 'ceph-coverage', - coverage_dir, - 'ceph-mon', - '--mkfs', - '-i', id_, - '--monmap={tdir}/monmap'.format(tdir=testdir), - '--osdmap={tdir}/osdmap'.format(tdir=testdir), - '--keyring={kpath}'.format(kpath=keyring_path), - ], - ) - - - run.wait( - mons.run( - args=[ - 'rm', - '--', - '{tdir}/monmap'.format(tdir=testdir), - '{tdir}/osdmap'.format(tdir=testdir), - ], - wait=False, - ), - ) - - try: - yield - except Exception: - # we need to know this below - ctx.summary['success'] = False - raise - finally: - (mon0_remote,) = ctx.cluster.only(firstmon).remotes.keys() - - log.info('Checking cluster log for badness...') - def first_in_ceph_log(pattern, excludes): - """ - Find the first occurence of the pattern specified in the Ceph log, - Returns None if none found. - - :param pattern: Pattern scanned for. - :param excludes: Patterns to ignore. 
- :return: First line of text (or None if not found) - """ - args = [ - 'sudo', - 'egrep', pattern, - '/var/log/ceph/ceph.log', - ] - for exclude in excludes: - args.extend([run.Raw('|'), 'egrep', '-v', exclude]) - args.extend([ - run.Raw('|'), 'head', '-n', '1', - ]) - r = mon0_remote.run( - stdout=StringIO(), - args=args, - ) - stdout = r.stdout.getvalue() - if stdout != '': - return stdout - return None - - if first_in_ceph_log('\[ERR\]|\[WRN\]|\[SEC\]', - config['log_whitelist']) is not None: - log.warning('Found errors (ERR|WRN|SEC) in cluster log') - ctx.summary['success'] = False - # use the most severe problem as the failure reason - if 'failure_reason' not in ctx.summary: - for pattern in ['\[SEC\]', '\[ERR\]', '\[WRN\]']: - match = first_in_ceph_log(pattern, config['log_whitelist']) - if match is not None: - ctx.summary['failure_reason'] = \ - '"{match}" in cluster log'.format( - match=match.rstrip('\n'), - ) - break - - for remote, dirs in devs_to_clean.iteritems(): - for dir_ in dirs: - log.info('Unmounting %s on %s' % (dir_, remote)) - remote.run( - args=[ - 'sync', - run.Raw('&&'), - 'sudo', - 'umount', - '-f', - dir_ - ] - ) - - if config.get('tmpfs_journal'): - log.info('tmpfs journal enabled - unmounting tmpfs at /mnt') - for remote, roles_for_host in osds.remotes.iteritems(): - remote.run( - args=[ 'sudo', 'umount', '-f', '/mnt' ], - check_status=False, - ) - - if ctx.archive is not None and \ - not (ctx.config.get('archive-on-error') and ctx.summary['success']): - # archive mon data, too - log.info('Archiving mon data...') - path = os.path.join(ctx.archive, 'data') - os.makedirs(path) - for remote, roles in mons.remotes.iteritems(): - for role in roles: - if role.startswith('mon.'): - teuthology.pull_directory_tarball( - remote, - '/var/lib/ceph/mon', - path + '/' + role + '.tgz') - - # and logs - log.info('Compressing logs...') - run.wait( - ctx.cluster.run( - args=[ - 'sudo', - 'find', - '/var/log/ceph', - '-name', - '*.log', - '-print0', - 
run.Raw('|'), - 'sudo', - 'xargs', - '-0', - '--no-run-if-empty', - '--', - 'gzip', - '--', - ], - wait=False, - ), - ) - - log.info('Archiving logs...') - path = os.path.join(ctx.archive, 'remote') - os.makedirs(path) - for remote in ctx.cluster.remotes.iterkeys(): - sub = os.path.join(path, remote.shortname) - os.makedirs(sub) - teuthology.pull_directory(remote, '/var/log/ceph', - os.path.join(sub, 'log')) - - - log.info('Cleaning ceph cluster...') - run.wait( - ctx.cluster.run( - args=[ - 'sudo', - 'rm', - '-rf', - '--', - conf_path, - keyring_path, - '{tdir}/data'.format(tdir=testdir), - '{tdir}/monmap'.format(tdir=testdir), - ], - wait=False, - ), - ) - -def get_all_pg_info(rem_site, testdir): - """ - Get the results of a ceph pg dump - """ - info = rem_site.run(args=[ - 'adjust-ulimits', - 'ceph-coverage', - '{tdir}/archive/coverage'.format(tdir=testdir), - 'ceph', 'pg', 'dump', - '--format', 'json'], stdout=StringIO()) - all_info = json.loads(info.stdout.getvalue()) - return all_info['pg_stats'] - -def osd_scrub_pgs(ctx, config): - """ - Scrub pgs when we exit. - - First make sure all pgs are active and clean. - Next scrub all osds. - Then periodically check until all pgs have scrub time stamps that - indicate the last scrub completed. Time out if no progess is made - here after two minutes. 
- """ - retries = 12 - delays = 10 - vlist = ctx.cluster.remotes.values() - testdir = teuthology.get_testdir(ctx) - rem_site = ctx.cluster.remotes.keys()[0] - all_clean = False - for _ in range(0, retries): - stats = get_all_pg_info(rem_site, testdir) - states = [stat['state'] for stat in stats] - if len(set(states)) == 1 and states[0] == 'active+clean': - all_clean = True - break - log.info("Waiting for all osds to be active and clean.") - time.sleep(delays) - if not all_clean: - log.info("Scrubbing terminated -- not all pgs were active and clean.") - return - check_time_now = time.localtime() - time.sleep(1) - for slists in vlist: - for role in slists: - if role.startswith('osd.'): - log.info("Scrubbing osd {osd}".format(osd=role)) - rem_site.run(args=[ - 'adjust-ulimits', - 'ceph-coverage', - '{tdir}/archive/coverage'.format(tdir=testdir), - 'ceph', 'osd', 'deep-scrub', role]) - prev_good = 0 - gap_cnt = 0 - loop = True - while loop: - stats = get_all_pg_info(rem_site, testdir) - timez = [stat['last_scrub_stamp'] for stat in stats] - loop = False - thiscnt = 0 - for tmval in timez: - pgtm = time.strptime(tmval[0:tmval.find('.')], '%Y-%m-%d %H:%M:%S') - if pgtm > check_time_now: - thiscnt += 1 - else: - loop = True - if thiscnt > prev_good: - prev_good = thiscnt - gap_cnt = 0 - else: - gap_cnt += 1 - if gap_cnt > retries: - log.info('Exiting scrub checking -- not all pgs scrubbed.') - return - if loop: - log.info('Still waiting for all pgs to be scrubbed.') - time.sleep(delays) - -@contextlib.contextmanager -def run_daemon(ctx, config, type_): - """ - Run daemons for a role type. Handle the startup and termination of a a daemon. - On startup -- set coverages, cpu_profile, valgrind values for all remotes, - and a max_mds value for one mds. - On cleanup -- Stop all existing daemons of this type. - - :param ctx: Context - :param config: Configuration - :paran type_: Role type - """ - log.info('Starting %s daemons...' 
% type_) - testdir = teuthology.get_testdir(ctx) - daemons = ctx.cluster.only(teuthology.is_type(type_)) - - # check whether any daemons if this type are configured - if daemons is None: - return - coverage_dir = '{tdir}/archive/coverage'.format(tdir=testdir) - - daemon_signal = 'kill' - if config.get('coverage') or config.get('valgrind') is not None: - daemon_signal = 'term' - - for remote, roles_for_host in daemons.remotes.iteritems(): - for id_ in teuthology.roles_of_type(roles_for_host, type_): - name = '%s.%s' % (type_, id_) - - run_cmd = [ - 'sudo', - 'adjust-ulimits', - 'ceph-coverage', - coverage_dir, - 'daemon-helper', - daemon_signal, - ] - run_cmd_tail = [ - 'ceph-%s' % (type_), - '-f', - '-i', id_] - - if type_ in config.get('cpu_profile', []): - profile_path = '/var/log/ceph/profiling-logger/%s.%s.prof' % (type_, id_) - run_cmd.extend([ 'env', 'CPUPROFILE=%s' % profile_path ]) - - if config.get('valgrind') is not None: - valgrind_args = None - if type_ in config['valgrind']: - valgrind_args = config['valgrind'][type_] - if name in config['valgrind']: - valgrind_args = config['valgrind'][name] - run_cmd = teuthology.get_valgrind_args(testdir, name, - run_cmd, - valgrind_args) - - run_cmd.extend(run_cmd_tail) - - ctx.daemons.add_daemon(remote, type_, id_, - args=run_cmd, - logger=log.getChild(name), - stdin=run.PIPE, - wait=False, - ) - - try: - yield - finally: - teuthology.stop_daemons_of_type(ctx, type_) - -def healthy(ctx, config): - """ - Wait for all osd's to be up, and for the ceph health monitor to return HEALTH_OK. 
- - :param ctx: Context - :param config: Configuration - """ - log.info('Waiting until ceph is healthy...') - firstmon = teuthology.get_first_mon(ctx, config) - (mon0_remote,) = ctx.cluster.only(firstmon).remotes.keys() - teuthology.wait_until_osds_up( - ctx, - cluster=ctx.cluster, - remote=mon0_remote - ) - teuthology.wait_until_healthy( - ctx, - remote=mon0_remote, - ) - -def wait_for_osds_up(ctx, config): - """ - Wait for all osd's to come up. - - :param ctx: Context - :param config: Configuration - """ - log.info('Waiting until ceph osds are all up...') - firstmon = teuthology.get_first_mon(ctx, config) - (mon0_remote,) = ctx.cluster.only(firstmon).remotes.keys() - teuthology.wait_until_osds_up( - ctx, - cluster=ctx.cluster, - remote=mon0_remote - ) - -def wait_for_mon_quorum(ctx, config): - """ - Check renote ceph status until all monitors are up. - - :param ctx: Context - :param config: Configuration - """ - - assert isinstance(config, list) - firstmon = teuthology.get_first_mon(ctx, config) - (remote,) = ctx.cluster.only(firstmon).remotes.keys() - while True: - r = remote.run( - args=[ - 'ceph', - 'quorum_status', - ], - stdout=StringIO(), - logger=log.getChild('quorum_status'), - ) - j = json.loads(r.stdout.getvalue()) - q = j.get('quorum_names', []) - log.debug('Quorum: %s', q) - if sorted(q) == sorted(config): - break - time.sleep(1) - - -@contextlib.contextmanager -def restart(ctx, config): - """ - restart ceph daemons - - For example:: - tasks: - - ceph.restart: [all] - - For example:: - tasks: - - ceph.restart: [osd.0, mon.1] - - or:: - - tasks: - - ceph.restart: - daemons: [osd.0, mon.1] - wait-for-healthy: false - wait-for-osds-up: true - - :param ctx: Context - :param config: Configuration - """ - if config is None: - config = {} - if isinstance(config, list): - config = { 'daemons': config } - if 'daemons' not in config: - config['daemons'] = [] - type_daemon = ['mon', 'osd', 'mds', 'rgw'] - for d in type_daemon: - type_ = d - for daemon in 
ctx.daemons.iter_daemons_of_role(type_): - config['daemons'].append(type_ + '.' + daemon.id_) - - assert isinstance(config['daemons'], list) - daemons = dict.fromkeys(config['daemons']) - for i in daemons.keys(): - type_ = i.split('.', 1)[0] - id_ = i.split('.', 1)[1] - ctx.daemons.get_daemon(type_, id_).stop() - ctx.daemons.get_daemon(type_, id_).restart() - - if config.get('wait-for-healthy', True): - healthy(ctx=ctx, config=None) - if config.get('wait-for-osds-up', False): - wait_for_osds_up(ctx=ctx, config=None) - yield - -@contextlib.contextmanager -def task(ctx, config): - """ - Set up and tear down a Ceph cluster. - - For example:: - - tasks: - - ceph: - - interactive: - - You can also specify what branch to run:: - - tasks: - - ceph: - branch: foo - - Or a tag:: - - tasks: - - ceph: - tag: v0.42.13 - - Or a sha1:: - - tasks: - - ceph: - sha1: 1376a5ab0c89780eab39ffbbe436f6a6092314ed - - Or a local source dir:: - - tasks: - - ceph: - path: /home/sage/ceph - - To capture code coverage data, use:: - - tasks: - - ceph: - coverage: true - - To use btrfs, ext4, or xfs on the target's scratch disks, use:: - - tasks: - - ceph: - fs: xfs - mkfs_options: [-b,size=65536,-l,logdev=/dev/sdc1] - mount_options: [nobarrier, inode64] - - Note, this will cause the task to check the /scratch_devs file on each node - for available devices. If no such file is found, /dev/sdb will be used. - - To run some daemons under valgrind, include their names - and the tool/args to use in a valgrind section:: - - tasks: - - ceph: - valgrind: - mds.1: --tool=memcheck - osd.1: [--tool=memcheck, --leak-check=no] - - Those nodes which are using memcheck or valgrind will get - checked for bad results. 
- - To adjust or modify config options, use:: - - tasks: - - ceph: - conf: - section: - key: value - - For example:: - - tasks: - - ceph: - conf: - mds.0: - some option: value - other key: other value - client.0: - debug client: 10 - debug ms: 1 - - By default, the cluster log is checked for errors and warnings, - and the run marked failed if any appear. You can ignore log - entries by giving a list of egrep compatible regexes, i.e.: - - tasks: - - ceph: - log-whitelist: ['foo.*bar', 'bad message'] - - :param ctx: Context - :param config: Configuration - """ - if config is None: - config = {} - assert isinstance(config, dict), \ - "task ceph only supports a dictionary for configuration" - - overrides = ctx.config.get('overrides', {}) - teuthology.deep_merge(config, overrides.get('ceph', {})) - - ctx.daemons = DaemonGroup() - - testdir = teuthology.get_testdir(ctx) - if config.get('coverage'): - coverage_dir = '{tdir}/archive/coverage'.format(tdir=testdir) - log.info('Creating coverage directory...') - run.wait( - ctx.cluster.run( - args=[ - 'install', '-d', '-m0755', '--', - coverage_dir, - ], - wait=False, - ) - ) - - with contextutil.nested( - lambda: ceph_log(ctx=ctx, config=None), - lambda: valgrind_post(ctx=ctx, config=config), - lambda: cluster(ctx=ctx, config=dict( - conf=config.get('conf', {}), - fs=config.get('fs', None), - mkfs_options=config.get('mkfs_options', None), - mount_options=config.get('mount_options',None), - block_journal=config.get('block_journal', None), - tmpfs_journal=config.get('tmpfs_journal', None), - log_whitelist=config.get('log-whitelist', []), - cpu_profile=set(config.get('cpu_profile', [])), - )), - lambda: run_daemon(ctx=ctx, config=config, type_='mon'), - lambda: run_daemon(ctx=ctx, config=config, type_='osd'), - lambda: cephfs_setup(ctx=ctx, config=config), - lambda: run_daemon(ctx=ctx, config=config, type_='mds'), - ): - try: - if config.get('wait-for-healthy', True): - healthy(ctx=ctx, config=None) - yield - finally: - if 
config.get('wait-for-scrub', True): - osd_scrub_pgs(ctx, config) diff --git a/teuthology/task/ceph_client.py b/teuthology/task/ceph_client.py deleted file mode 100644 index 8935fc8719..0000000000 --- a/teuthology/task/ceph_client.py +++ /dev/null @@ -1,40 +0,0 @@ -""" -Set up client keyring -""" -import logging - -from teuthology import misc as teuthology -from ..orchestra import run - -log = logging.getLogger(__name__) - -def create_keyring(ctx): - """ - Set up key ring on remote sites - """ - log.info('Setting up client nodes...') - clients = ctx.cluster.only(teuthology.is_type('client')) - testdir = teuthology.get_testdir(ctx) - coverage_dir = '{tdir}/archive/coverage'.format(tdir=testdir) - for remote, roles_for_host in clients.remotes.iteritems(): - for id_ in teuthology.roles_of_type(roles_for_host, 'client'): - client_keyring = '/etc/ceph/ceph.client.{id}.keyring'.format(id=id_) - remote.run( - args=[ - 'sudo', - 'adjust-ulimits', - 'ceph-coverage', - coverage_dir, - 'ceph-authtool', - '--create-keyring', - '--gen-key', - # TODO this --name= is not really obeyed, all unknown "types" are munged to "client" - '--name=client.{id}'.format(id=id_), - client_keyring, - run.Raw('&&'), - 'sudo', - 'chmod', - '0644', - client_keyring, - ], - ) diff --git a/teuthology/task/ceph_deploy.py b/teuthology/task/ceph_deploy.py deleted file mode 100644 index 9964bab996..0000000000 --- a/teuthology/task/ceph_deploy.py +++ /dev/null @@ -1,478 +0,0 @@ -""" -Execute ceph-deploy as a task -""" -from cStringIO import StringIO - -import contextlib -import os -import time -import logging - -from teuthology import misc as teuthology -from teuthology import contextutil -from ..config import config as teuth_config -import install as install_fn -from ..orchestra import run - -log = logging.getLogger(__name__) - - -@contextlib.contextmanager -def download_ceph_deploy(ctx, config): - """ - Downloads ceph-deploy from the ceph.com git mirror and (by default) - switches to the master branch. 
If the `ceph-deploy-branch` is specified, it - will use that instead. - """ - log.info('Downloading ceph-deploy...') - testdir = teuthology.get_testdir(ctx) - ceph_admin = teuthology.get_first_mon(ctx, config) - default_cd_branch = {'ceph-deploy-branch': 'master'} - ceph_deploy_branch = config.get( - 'ceph-deploy', - default_cd_branch).get('ceph-deploy-branch') - - ctx.cluster.only(ceph_admin).run( - args=[ - 'git', 'clone', '-b', ceph_deploy_branch, - teuth_config.ceph_git_base_url + 'ceph-deploy.git', - '{tdir}/ceph-deploy'.format(tdir=testdir), - ], - ) - ctx.cluster.only(ceph_admin).run( - args=[ - 'cd', - '{tdir}/ceph-deploy'.format(tdir=testdir), - run.Raw('&&'), - './bootstrap', - ], - ) - - try: - yield - finally: - log.info('Removing ceph-deploy ...') - ctx.cluster.only(ceph_admin).run( - args=[ - 'rm', - '-rf', - '{tdir}/ceph-deploy'.format(tdir=testdir), - ], - ) - - -def is_healthy(ctx, config): - """Wait until a Ceph cluster is healthy.""" - testdir = teuthology.get_testdir(ctx) - ceph_admin = teuthology.get_first_mon(ctx, config) - (remote,) = ctx.cluster.only(ceph_admin).remotes.keys() - max_tries = 90 # 90 tries * 10 secs --> 15 minutes - tries = 0 - while True: - tries += 1 - if tries >= max_tries: - msg = "ceph health was unable to get 'HEALTH_OK' after waiting 15 minutes" - raise RuntimeError(msg) - - r = remote.run( - args=[ - 'cd', - '{tdir}'.format(tdir=testdir), - run.Raw('&&'), - 'sudo', 'ceph', - 'health', - ], - stdout=StringIO(), - logger=log.getChild('health'), - ) - out = r.stdout.getvalue() - log.debug('Ceph health: %s', out.rstrip('\n')) - if out.split(None, 1)[0] == 'HEALTH_OK': - break - time.sleep(10) - -def get_nodes_using_roles(ctx, config, role): - """Extract the names of nodes that match a given role from a cluster""" - newl = [] - for _remote, roles_for_host in ctx.cluster.remotes.iteritems(): - for id_ in teuthology.roles_of_type(roles_for_host, role): - rem = _remote - if role == 'mon': - req1 = str(rem).split('@')[-1] - 
else: - req = str(rem).split('.')[0] - req1 = str(req).split('@')[1] - newl.append(req1) - return newl - -def get_dev_for_osd(ctx, config): - """Get a list of all osd device names.""" - osd_devs = [] - for remote, roles_for_host in ctx.cluster.remotes.iteritems(): - host = remote.name.split('@')[-1] - shortname = host.split('.')[0] - devs = teuthology.get_scratch_devices(remote) - num_osd_per_host = list(teuthology.roles_of_type(roles_for_host, 'osd')) - num_osds = len(num_osd_per_host) - assert num_osds <= len(devs), 'fewer disks than osds on ' + shortname - for dev in devs[:num_osds]: - dev_short = dev.split('/')[-1] - osd_devs.append('{host}:{dev}'.format(host=shortname, dev=dev_short)) - return osd_devs - -def get_all_nodes(ctx, config): - """Return a string of node names separated by blanks""" - nodelist = [] - for t, k in ctx.config['targets'].iteritems(): - host = t.split('@')[-1] - simple_host = host.split('.')[0] - nodelist.append(simple_host) - nodelist = " ".join(nodelist) - return nodelist - -def execute_ceph_deploy(ctx, config, cmd): - """Remotely execute a ceph_deploy command""" - testdir = teuthology.get_testdir(ctx) - ceph_admin = teuthology.get_first_mon(ctx, config) - exec_cmd = cmd - (remote,) = ctx.cluster.only(ceph_admin).remotes.iterkeys() - proc = remote.run( - args = [ - 'cd', - '{tdir}/ceph-deploy'.format(tdir=testdir), - run.Raw('&&'), - run.Raw(exec_cmd), - ], - check_status=False, - ) - exitstatus = proc.exitstatus - return exitstatus - - -@contextlib.contextmanager -def build_ceph_cluster(ctx, config): - """Build a ceph cluster""" - - try: - log.info('Building ceph cluster using ceph-deploy...') - testdir = teuthology.get_testdir(ctx) - ceph_branch = None - if config.get('branch') is not None: - cbranch = config.get('branch') - for var, val in cbranch.iteritems(): - if var == 'testing': - ceph_branch = '--{var}'.format(var=var) - ceph_branch = '--{var}={val}'.format(var=var, val=val) - node_dev_list = [] - all_nodes = get_all_nodes(ctx, 
config) - mds_nodes = get_nodes_using_roles(ctx, config, 'mds') - mds_nodes = " ".join(mds_nodes) - mon_node = get_nodes_using_roles(ctx, config, 'mon') - mon_nodes = " ".join(mon_node) - new_mon = './ceph-deploy new'+" "+mon_nodes - install_nodes = './ceph-deploy install '+ceph_branch+" "+all_nodes - purge_nodes = './ceph-deploy purge'+" "+all_nodes - purgedata_nodes = './ceph-deploy purgedata'+" "+all_nodes - mon_hostname = mon_nodes.split(' ')[0] - mon_hostname = str(mon_hostname) - gather_keys = './ceph-deploy gatherkeys'+" "+mon_hostname - deploy_mds = './ceph-deploy mds create'+" "+mds_nodes - no_of_osds = 0 - - if mon_nodes is None: - raise RuntimeError("no monitor nodes in the config file") - - estatus_new = execute_ceph_deploy(ctx, config, new_mon) - if estatus_new != 0: - raise RuntimeError("ceph-deploy: new command failed") - - log.info('adding config inputs...') - testdir = teuthology.get_testdir(ctx) - conf_path = '{tdir}/ceph-deploy/ceph.conf'.format(tdir=testdir) - first_mon = teuthology.get_first_mon(ctx, config) - (remote,) = ctx.cluster.only(first_mon).remotes.keys() - - lines = None - if config.get('conf') is not None: - confp = config.get('conf') - for section, keys in confp.iteritems(): - lines = '[{section}]\n'.format(section=section) - teuthology.append_lines_to_file(remote, conf_path, lines, - sudo=True) - for key, value in keys.iteritems(): - log.info("[%s] %s = %s" % (section, key, value)) - lines = '{key} = {value}\n'.format(key=key, value=value) - teuthology.append_lines_to_file(remote, conf_path, lines, - sudo=True) - - estatus_install = execute_ceph_deploy(ctx, config, install_nodes) - if estatus_install != 0: - raise RuntimeError("ceph-deploy: Failed to install ceph") - - mon_no = None - mon_no = config.get('mon_initial_members') - if mon_no is not None: - i = 0 - mon1 = [] - while(i < mon_no): - mon1.append(mon_node[i]) - i = i + 1 - initial_mons = " ".join(mon1) - for k in range(mon_no, len(mon_node)): - mon_create_nodes = 
'./ceph-deploy mon create' + " " + \ - initial_mons + " " + mon_node[k] - estatus_mon = execute_ceph_deploy(ctx, config, - mon_create_nodes) - if estatus_mon != 0: - raise RuntimeError("ceph-deploy: Failed to create monitor") - else: - mon_create_nodes = './ceph-deploy mon create-initial' - estatus_mon = execute_ceph_deploy(ctx, config, mon_create_nodes) - if estatus_mon != 0: - raise RuntimeError("ceph-deploy: Failed to create monitors") - - estatus_gather = execute_ceph_deploy(ctx, config, gather_keys) - max_gather_tries = 90 - gather_tries = 0 - while (estatus_gather != 0): - gather_tries += 1 - if gather_tries >= max_gather_tries: - msg = 'ceph-deploy was not able to gatherkeys after 15 minutes' - raise RuntimeError(msg) - estatus_gather = execute_ceph_deploy(ctx, config, gather_keys) - time.sleep(10) - - if mds_nodes: - estatus_mds = execute_ceph_deploy(ctx, config, deploy_mds) - if estatus_mds != 0: - raise RuntimeError("ceph-deploy: Failed to deploy mds") - - if config.get('test_mon_destroy') is not None: - for d in range(1, len(mon_node)): - mon_destroy_nodes = './ceph-deploy mon destroy'+" "+mon_node[d] - estatus_mon_d = execute_ceph_deploy(ctx, config, - mon_destroy_nodes) - if estatus_mon_d != 0: - raise RuntimeError("ceph-deploy: Failed to delete monitor") - - node_dev_list = get_dev_for_osd(ctx, config) - for d in node_dev_list: - osd_create_cmds = './ceph-deploy osd create --zap-disk'+" "+d - estatus_osd = execute_ceph_deploy(ctx, config, osd_create_cmds) - if estatus_osd == 0: - log.info('successfully created osd') - no_of_osds += 1 - else: - zap_disk = './ceph-deploy disk zap'+" "+d - execute_ceph_deploy(ctx, config, zap_disk) - estatus_osd = execute_ceph_deploy(ctx, config, osd_create_cmds) - if estatus_osd == 0: - log.info('successfully created osd') - no_of_osds += 1 - else: - raise RuntimeError("ceph-deploy: Failed to create osds") - - if config.get('wait-for-healthy', True) and no_of_osds >= 2: - is_healthy(ctx=ctx, config=None) - - 
log.info('Setting up client nodes...') - conf_path = '/etc/ceph/ceph.conf' - admin_keyring_path = '/etc/ceph/ceph.client.admin.keyring' - first_mon = teuthology.get_first_mon(ctx, config) - (mon0_remote,) = ctx.cluster.only(first_mon).remotes.keys() - conf_data = teuthology.get_file( - remote=mon0_remote, - path=conf_path, - sudo=True, - ) - admin_keyring = teuthology.get_file( - remote=mon0_remote, - path=admin_keyring_path, - sudo=True, - ) - - clients = ctx.cluster.only(teuthology.is_type('client')) - for remot, roles_for_host in clients.remotes.iteritems(): - for id_ in teuthology.roles_of_type(roles_for_host, 'client'): - client_keyring = \ - '/etc/ceph/ceph.client.{id}.keyring'.format(id=id_) - mon0_remote.run( - args=[ - 'cd', - '{tdir}'.format(tdir=testdir), - run.Raw('&&'), - 'sudo', 'bash', '-c', - run.Raw('"'), 'ceph', - 'auth', - 'get-or-create', - 'client.{id}'.format(id=id_), - 'mds', 'allow', - 'mon', 'allow *', - 'osd', 'allow *', - run.Raw('>'), - client_keyring, - run.Raw('"'), - ], - ) - key_data = teuthology.get_file( - remote=mon0_remote, - path=client_keyring, - sudo=True, - ) - teuthology.sudo_write_file( - remote=remot, - path=client_keyring, - data=key_data, - perms='0644' - ) - teuthology.sudo_write_file( - remote=remot, - path=admin_keyring_path, - data=admin_keyring, - perms='0644' - ) - teuthology.sudo_write_file( - remote=remot, - path=conf_path, - data=conf_data, - perms='0644' - ) - else: - raise RuntimeError( - "The cluster is NOT operational due to insufficient OSDs") - yield - - finally: - log.info('Stopping ceph...') - ctx.cluster.run(args=['sudo', 'stop', 'ceph-all', run.Raw('||'), - 'sudo', 'service', 'ceph', 'stop' ]) - - # Are you really not running anymore? - # try first with the init tooling - ctx.cluster.run(args=['sudo', 'status', 'ceph-all', run.Raw('||'), - 'sudo', 'service', 'status', 'ceph-all']) - - # and now just check for the processes themselves, as if upstart/sysvinit - # is lying to us. 
Ignore errors if the grep fails - ctx.cluster.run(args=['sudo', 'ps', 'aux', run.Raw('|'), - 'grep', '-v', 'grep', run.Raw('|'), - 'grep', 'ceph'], check_status=False) - - if ctx.archive is not None: - # archive mon data, too - log.info('Archiving mon data...') - path = os.path.join(ctx.archive, 'data') - os.makedirs(path) - mons = ctx.cluster.only(teuthology.is_type('mon')) - for remote, roles in mons.remotes.iteritems(): - for role in roles: - if role.startswith('mon.'): - teuthology.pull_directory_tarball( - remote, - '/var/lib/ceph/mon', - path + '/' + role + '.tgz') - - log.info('Compressing logs...') - run.wait( - ctx.cluster.run( - args=[ - 'sudo', - 'find', - '/var/log/ceph', - '-name', - '*.log', - '-print0', - run.Raw('|'), - 'sudo', - 'xargs', - '-0', - '--no-run-if-empty', - '--', - 'gzip', - '--', - ], - wait=False, - ), - ) - - log.info('Archiving logs...') - path = os.path.join(ctx.archive, 'remote') - os.makedirs(path) - for remote in ctx.cluster.remotes.iterkeys(): - sub = os.path.join(path, remote.shortname) - os.makedirs(sub) - teuthology.pull_directory(remote, '/var/log/ceph', - os.path.join(sub, 'log')) - - # Prevent these from being undefined if the try block fails - all_nodes = get_all_nodes(ctx, config) - purge_nodes = './ceph-deploy purge'+" "+all_nodes - purgedata_nodes = './ceph-deploy purgedata'+" "+all_nodes - - log.info('Purging package...') - execute_ceph_deploy(ctx, config, purge_nodes) - log.info('Purging data...') - execute_ceph_deploy(ctx, config, purgedata_nodes) - - -@contextlib.contextmanager -def task(ctx, config): - """ - Set up and tear down a Ceph cluster. 
- - For example:: - - tasks: - - install: - extras: yes - - ssh_keys: - - ceph-deploy: - branch: - stable: bobtail - mon_initial_members: 1 - - tasks: - - install: - extras: yes - - ssh_keys: - - ceph-deploy: - branch: - dev: master - conf: - mon: - debug mon = 20 - - tasks: - - install: - extras: yes - - ssh_keys: - - ceph-deploy: - branch: - testing: - """ - if config is None: - config = {} - - overrides = ctx.config.get('overrides', {}) - teuthology.deep_merge(config, overrides.get('ceph-deploy', {})) - - assert isinstance(config, dict), \ - "task ceph-deploy only supports a dictionary for configuration" - - overrides = ctx.config.get('overrides', {}) - teuthology.deep_merge(config, overrides.get('ceph-deploy', {})) - - if config.get('branch') is not None: - assert isinstance(config['branch'], dict), 'branch must be a dictionary' - - with contextutil.nested( - lambda: install_fn.ship_utilities(ctx=ctx, config=None), - lambda: download_ceph_deploy(ctx=ctx, config=config), - lambda: build_ceph_cluster(ctx=ctx, config=dict( - conf=config.get('conf', {}), - branch=config.get('branch',{}), - mon_initial_members=config.get('mon_initial_members', None), - test_mon_destroy=config.get('test_mon_destroy', None), - )), - ): - yield diff --git a/teuthology/task/ceph_fuse.py b/teuthology/task/ceph_fuse.py deleted file mode 100644 index 306a4f1112..0000000000 --- a/teuthology/task/ceph_fuse.py +++ /dev/null @@ -1,105 +0,0 @@ -""" -Ceph FUSE client task -""" - -import contextlib -import logging - -from teuthology import misc as teuthology -from ..orchestra import run -from teuthology.task.cephfs.fuse_mount import FuseMount - -log = logging.getLogger(__name__) - - -def get_client_configs(ctx, config): - """ - Get a map of the configuration for each FUSE client in the configuration - by combining the configuration of the current task with any global overrides. 
- - :param ctx: Context instance - :param config: configuration for this task - :return: dict of client name to config or to None - """ - if config is None: - config = dict(('client.{id}'.format(id=id_), None) - for id_ in teuthology.all_roles_of_type(ctx.cluster, 'client')) - elif isinstance(config, list): - config = dict((name, None) for name in config) - - overrides = ctx.config.get('overrides', {}) - teuthology.deep_merge(config, overrides.get('ceph-fuse', {})) - - return config - - -@contextlib.contextmanager -def task(ctx, config): - """ - Mount/unmount a ``ceph-fuse`` client. - - The config is optional and defaults to mounting on all clients. If - a config is given, it is expected to be a list of clients to do - this operation on. This lets you e.g. set up one client with - ``ceph-fuse`` and another with ``kclient``. - - Example that mounts all clients:: - - tasks: - - ceph: - - ceph-fuse: - - interactive: - - Example that uses both ``kclient` and ``ceph-fuse``:: - - tasks: - - ceph: - - ceph-fuse: [client.0] - - kclient: [client.1] - - interactive: - - Example that enables valgrind: - - tasks: - - ceph: - - ceph-fuse: - client.0: - valgrind: [--tool=memcheck, --leak-check=full, --show-reachable=yes] - - interactive: - - :param ctx: Context - :param config: Configuration - """ - log.info('Mounting ceph-fuse clients...') - - testdir = teuthology.get_testdir(ctx) - config = get_client_configs(ctx, config) - - clients = list(teuthology.get_clients(ctx=ctx, roles=config.keys())) - - fuse_mounts = {} - for id_, remote in clients: - client_config = config.get("client.%s" % id_) - if client_config is None: - client_config = {} - - fuse_mount = FuseMount(client_config, testdir, id_, remote) - fuse_mounts[id_] = fuse_mount - - fuse_mount.mount() - - for mount in fuse_mounts.values(): - mount.wait_until_mounted() - - ctx.mounts = fuse_mounts - try: - yield fuse_mounts - finally: - log.info('Unmounting ceph-fuse clients...') - for mount in fuse_mounts.values(): - 
mount.umount() - - run.wait([m.fuse_daemon for m in fuse_mounts.values()], timeout=600) - - for mount in fuse_mounts.values(): - mount.cleanup() diff --git a/teuthology/task/ceph_manager.py b/teuthology/task/ceph_manager.py deleted file mode 100644 index 951fdc3cba..0000000000 --- a/teuthology/task/ceph_manager.py +++ /dev/null @@ -1,1436 +0,0 @@ -""" -ceph manager -- Thrasher and CephManager objects -""" -from cStringIO import StringIO -import random -import time -import gevent -import json -import threading -from teuthology import misc as teuthology -from teuthology.task import ceph as ceph_task -from teuthology.task.scrub import Scrubber -from teuthology.task_util.rados import cmd_erasure_code_profile - -class Thrasher: - """ - Object used to thrash Ceph - """ - def __init__(self, manager, config, logger=None): - self.ceph_manager = manager - self.ceph_manager.wait_for_clean() - osd_status = self.ceph_manager.get_osd_status() - self.in_osds = osd_status['in'] - self.live_osds = osd_status['live'] - self.out_osds = osd_status['out'] - self.dead_osds = osd_status['dead'] - self.stopping = False - self.logger = logger - self.config = config - self.revive_timeout = self.config.get("revive_timeout", 75) - if self.config.get('powercycle'): - self.revive_timeout += 120 - self.clean_wait = self.config.get('clean_wait', 0) - self.minin = self.config.get("min_in", 3) - - num_osds = self.in_osds + self.out_osds - self.max_pgs = self.config.get("max_pgs_per_pool_osd", 1200) * num_osds - if self.logger is not None: - self.log = lambda x: self.logger.info(x) - else: - def tmp(x): - """ - Implement log behavior - """ - print x - self.log = tmp - if self.config is None: - self.config = dict() - # prevent monitor from auto-marking things out while thrasher runs - # try both old and new tell syntax, in case we are testing old code - try: - manager.raw_cluster_cmd('--', 'tell', 'mon.*', 'injectargs', - '--mon-osd-down-out-interval 0') - except Exception: - 
manager.raw_cluster_cmd('--', 'mon', 'tell', '*', 'injectargs', - '--mon-osd-down-out-interval 0') - self.thread = gevent.spawn(self.do_thrash) - - def kill_osd(self, osd=None, mark_down=False, mark_out=False): - """ - :param osd: Osd to be killed. - :mark_down: Mark down if true. - :mark_out: Mark out if true. - """ - if osd is None: - osd = random.choice(self.live_osds) - self.log("Killing osd %s, live_osds are %s" % (str(osd), str(self.live_osds))) - self.live_osds.remove(osd) - self.dead_osds.append(osd) - self.ceph_manager.kill_osd(osd) - if mark_down: - self.ceph_manager.mark_down_osd(osd) - if mark_out and osd in self.in_osds: - self.out_osd(osd) - - def blackhole_kill_osd(self, osd=None): - """ - If all else fails, kill the osd. - :param osd: Osd to be killed. - """ - if osd is None: - osd = random.choice(self.live_osds) - self.log("Blackholing and then killing osd %s, live_osds are %s" % (str(osd), str(self.live_osds))) - self.live_osds.remove(osd) - self.dead_osds.append(osd) - self.ceph_manager.blackhole_kill_osd(osd) - - def revive_osd(self, osd=None): - """ - Revive the osd. - :param osd: Osd to be revived. - """ - if osd is None: - osd = random.choice(self.dead_osds) - self.log("Reviving osd %s" % (str(osd),)) - self.live_osds.append(osd) - self.dead_osds.remove(osd) - self.ceph_manager.revive_osd(osd, self.revive_timeout) - - def out_osd(self, osd=None): - """ - Mark the osd out - :param osd: Osd to be marked. - """ - if osd is None: - osd = random.choice(self.in_osds) - self.log("Removing osd %s, in_osds are: %s" % (str(osd), str(self.in_osds))) - self.ceph_manager.mark_out_osd(osd) - self.in_osds.remove(osd) - self.out_osds.append(osd) - - def in_osd(self, osd=None): - """ - Mark the osd out - :param osd: Osd to be marked. 
- """ - if osd is None: - osd = random.choice(self.out_osds) - if osd in self.dead_osds: - return self.revive_osd(osd) - self.log("Adding osd %s" % (str(osd),)) - self.out_osds.remove(osd) - self.in_osds.append(osd) - self.ceph_manager.mark_in_osd(osd) - self.log("Added osd %s"%(str(osd),)) - - def reweight_osd(self, osd=None): - """ - Reweight an osd that is in - :param osd: Osd to be marked. - """ - if osd is None: - osd = random.choice(self.in_osds) - val = random.uniform(.1, 1.0) - self.log("Reweighting osd %s to %s" % (str(osd), str(val))) - self.ceph_manager.raw_cluster_cmd('osd', 'reweight', str(osd), str(val)) - - def primary_affinity(self, osd=None): - if osd is None: - osd = random.choice(self.in_osds) - if random.random() >= .5: - pa = random.random() - elif random.random() >= .5: - pa = 1 - else: - pa = 0 - self.log('Setting osd %s primary_affinity to %f' % (str(osd), pa)) - self.ceph_manager.raw_cluster_cmd('osd', 'primary-affinity', str(osd), str(pa)) - - def all_up(self): - """ - Make sure all osds are up and not out. - """ - while len(self.dead_osds) > 0: - self.log("reviving osd") - self.revive_osd() - while len(self.out_osds) > 0: - self.log("inning osd") - self.in_osd() - - def do_join(self): - """ - Break out of this Ceph loop - """ - self.stopping = True - self.thread.get() - - def grow_pool(self): - """ - Increase the size of the pool - """ - pool = self.ceph_manager.get_pool() - self.log("Growing pool %s"%(pool,)) - self.ceph_manager.expand_pool(pool, self.config.get('pool_grow_by', 10), self.max_pgs) - - def fix_pgp_num(self): - """ - Fix number of pgs in pool. - """ - pool = self.ceph_manager.get_pool() - self.log("fixing pg num pool %s"%(pool,)) - self.ceph_manager.set_pool_pgpnum(pool) - - def test_pool_min_size(self): - """ - Kill and revive all osds except one. 
- """ - self.log("test_pool_min_size") - self.all_up() - self.ceph_manager.wait_for_recovery( - timeout=self.config.get('timeout') - ) - the_one = random.choice(self.in_osds) - self.log("Killing everyone but %s", the_one) - to_kill = filter(lambda x: x != the_one, self.in_osds) - [self.kill_osd(i) for i in to_kill] - [self.out_osd(i) for i in to_kill] - time.sleep(self.config.get("test_pool_min_size_time", 10)) - self.log("Killing %s" % (the_one,)) - self.kill_osd(the_one) - self.out_osd(the_one) - self.log("Reviving everyone but %s" % (the_one,)) - [self.revive_osd(i) for i in to_kill] - [self.in_osd(i) for i in to_kill] - self.log("Revived everyone but %s" % (the_one,)) - self.log("Waiting for clean") - self.ceph_manager.wait_for_recovery( - timeout=self.config.get('timeout') - ) - - def inject_pause(self, conf_key, duration, check_after, should_be_down): - """ - Pause injection testing. Check for osd being down when finished. - """ - the_one = random.choice(self.live_osds) - self.log("inject_pause on {osd}".format(osd = the_one)) - self.log( - "Testing {key} pause injection for duration {duration}".format( - key = conf_key, - duration = duration - )) - self.log( - "Checking after {after}, should_be_down={shouldbedown}".format( - after = check_after, - shouldbedown = should_be_down - )) - self.ceph_manager.set_config(the_one, **{conf_key:duration}) - if not should_be_down: - return - time.sleep(check_after) - status = self.ceph_manager.get_osd_status() - assert the_one in status['down'] - time.sleep(duration - check_after + 20) - status = self.ceph_manager.get_osd_status() - assert not the_one in status['down'] - - def test_backfill_full(self): - """ - Test backfills stopping when the replica fills up. - - First, use osd_backfill_full_ratio to simulate a now full - osd by setting it to 0 on all of the OSDs. - - Second, on a random subset, set - osd_debug_skip_full_check_in_backfill_reservation to force - the more complicated check in do_scan to be exercised. 
- - Then, verify that all backfills stop. - """ - self.log("injecting osd_backfill_full_ratio = 0") - for i in self.live_osds: - self.ceph_manager.set_config( - i, - osd_debug_skip_full_check_in_backfill_reservation = random.choice( - ['false', 'true']), - osd_backfill_full_ratio = 0) - for i in range(30): - status = self.ceph_manager.compile_pg_status() - if 'backfill' not in status.keys(): - break - self.log( - "waiting for {still_going} backfills".format( - still_going=status.get('backfill'))) - time.sleep(1) - assert('backfill' not in self.ceph_manager.compile_pg_status().keys()) - for i in self.live_osds: - self.ceph_manager.set_config( - i, - osd_debug_skip_full_check_in_backfill_reservation = \ - 'false', - osd_backfill_full_ratio = 0.85) - - def test_map_discontinuity(self): - """ - 1) Allows the osds to recover - 2) kills an osd - 3) allows the remaining osds to recover - 4) waits for some time - 5) revives the osd - This sequence should cause the revived osd to have to handle - a map gap since the mons would have trimmed - """ - while len(self.in_osds) < (self.minin + 1): - self.in_osd() - self.log("Waiting for recovery") - self.ceph_manager.wait_for_all_up( - timeout=self.config.get('timeout') - ) - # now we wait 20s for the pg status to change, if it takes longer, - # the test *should* fail! - time.sleep(20) - self.ceph_manager.wait_for_clean( - timeout=self.config.get('timeout') - ) - - # now we wait 20s for the backfill replicas to hear about the clean - time.sleep(20) - self.log("Recovered, killing an osd") - self.kill_osd(mark_down=True, mark_out=True) - self.log("Waiting for clean again") - self.ceph_manager.wait_for_clean( - timeout=self.config.get('timeout') - ) - self.log("Waiting for trim") - time.sleep(int(self.config.get("map_discontinuity_sleep_time", 40))) - self.revive_osd() - - def choose_action(self): - """ - Random action selector. 
- """ - chance_down = self.config.get('chance_down', 0.4) - chance_test_min_size = self.config.get('chance_test_min_size', 0) - chance_test_backfill_full = self.config.get('chance_test_backfill_full', 0) - if isinstance(chance_down, int): - chance_down = float(chance_down) / 100 - minin = self.minin - minout = self.config.get("min_out", 0) - minlive = self.config.get("min_live", 2) - mindead = self.config.get("min_dead", 0) - - self.log('choose_action: min_in %d min_out %d min_live %d min_dead %d' % - (minin, minout, minlive, mindead)) - actions = [] - if len(self.in_osds) > minin: - actions.append((self.out_osd, 1.0,)) - if len(self.live_osds) > minlive and chance_down > 0: - actions.append((self.kill_osd, chance_down,)) - if len(self.out_osds) > minout: - actions.append((self.in_osd, 1.7,)) - if len(self.dead_osds) > mindead: - actions.append((self.revive_osd, 1.0,)) - if self.config.get('thrash_primary_affinity', True): - actions.append((self.primary_affinity, 1.0,)) - actions.append((self.reweight_osd, self.config.get('reweight_osd',.5),)) - actions.append((self.grow_pool, self.config.get('chance_pgnum_grow', 0),)) - actions.append((self.fix_pgp_num, self.config.get('chance_pgpnum_fix', 0),)) - actions.append((self.test_pool_min_size, chance_test_min_size,)) - actions.append((self.test_backfill_full, chance_test_backfill_full,)) - for key in ['heartbeat_inject_failure', 'filestore_inject_stall']: - for scenario in [ - (lambda: self.inject_pause(key, - self.config.get('pause_short', 3), - 0, - False), - self.config.get('chance_inject_pause_short', 1),), - (lambda: self.inject_pause(key, - self.config.get('pause_long', 80), - self.config.get('pause_check_after', 70), - True), - self.config.get('chance_inject_pause_long', 0),)]: - actions.append(scenario) - - total = sum([y for (x, y) in actions]) - val = random.uniform(0, total) - for (action, prob) in actions: - if val < prob: - return action - val -= prob - return None - - def do_thrash(self): - """ - Loop to 
select random actions to thrash ceph manager with. - """ - cleanint = self.config.get("clean_interval", 60) - scrubint = self.config.get("scrub_interval", -1) - maxdead = self.config.get("max_dead", 0) - delay = self.config.get("op_delay", 5) - self.log("starting do_thrash") - while not self.stopping: - self.log(" ".join([str(x) for x in ["in_osds: ", self.in_osds, " out_osds: ", self.out_osds, - "dead_osds: ", self.dead_osds, "live_osds: ", - self.live_osds]])) - if random.uniform(0, 1) < (float(delay) / cleanint): - while len(self.dead_osds) > maxdead: - self.revive_osd() - for osd in self.in_osds: - self.ceph_manager.raw_cluster_cmd('osd', 'reweight', - str(osd), str(1)) - if random.uniform(0, 1) < float( - self.config.get('chance_test_map_discontinuity', 0)): - self.test_map_discontinuity() - else: - self.ceph_manager.wait_for_recovery( - timeout=self.config.get('timeout') - ) - time.sleep(self.clean_wait) - if scrubint > 0: - if random.uniform(0, 1) < (float(delay) / scrubint): - self.log('Scrubbing while thrashing being performed') - Scrubber(self.ceph_manager, self.config) - self.choose_action()() - time.sleep(delay) - self.all_up() - -class CephManager: - """ - Ceph manager object. - Contains several local functions that form a bulk of this module. - """ - def __init__(self, controller, ctx=None, config=None, logger=None): - self.lock = threading.RLock() - self.ctx = ctx - self.config = config - self.controller = controller - self.next_pool_id = 0 - if (logger): - self.log = lambda x: logger.info(x) - else: - def tmp(x): - """ - implement log behavior. - """ - print x - self.log = tmp - if self.config is None: - self.config = dict() - pools = self.list_pools() - self.pools = {} - for pool in pools: - self.pools[pool] = self.get_pool_property(pool, 'pg_num') - - def raw_cluster_cmd(self, *args): - """ - Start ceph on a raw cluster. 
Return count - """ - testdir = teuthology.get_testdir(self.ctx) - ceph_args = [ - 'adjust-ulimits', - 'ceph-coverage', - '{tdir}/archive/coverage'.format(tdir=testdir), - 'ceph', - ] - ceph_args.extend(args) - proc = self.controller.run( - args=ceph_args, - stdout=StringIO(), - ) - return proc.stdout.getvalue() - - def raw_cluster_cmd_result(self, *args): - """ - Start ceph on a cluster. Return success or failure information. - """ - testdir = teuthology.get_testdir(self.ctx) - ceph_args = [ - 'adjust-ulimits', - 'ceph-coverage', - '{tdir}/archive/coverage'.format(tdir=testdir), - 'ceph', - ] - ceph_args.extend(args) - proc = self.controller.run( - args=ceph_args, - check_status=False, - ) - return proc.exitstatus - - def do_rados(self, remote, cmd): - """ - Execute a remote rados command. - """ - testdir = teuthology.get_testdir(self.ctx) - pre = [ - 'adjust-ulimits', - 'ceph-coverage', - '{tdir}/archive/coverage'.format(tdir=testdir), - 'rados', - ] - pre.extend(cmd) - proc = remote.run( - args=pre, - wait=True, - ) - return proc - - def rados_write_objects( - self, pool, num_objects, size, timelimit, threads, cleanup=False): - """ - Write rados objects - Threads not used yet. 
- """ - args = [ - '-p', pool, - '--num-objects', num_objects, - '-b', size, - 'bench', timelimit, - 'write' - ] - if not cleanup: args.append('--no-cleanup') - return self.do_rados(self.controller, map(str, args)) - - def do_put(self, pool, obj, fname): - """ - Implement rados put operation - """ - return self.do_rados( - self.controller, - [ - '-p', - pool, - 'put', - obj, - fname - ] - ) - - def do_get(self, pool, obj, fname='/dev/null'): - """ - Implement rados get operation - """ - return self.do_rados( - self.controller, - [ - '-p', - pool, - 'stat', - obj, - fname - ] - ) - - def osd_admin_socket(self, osd_id, command, check_status=True): - return self.admin_socket('osd', osd_id, command, check_status) - - def admin_socket(self, service_type, service_id, command, check_status=True): - """ - Remotely start up ceph specifying the admin socket - """ - testdir = teuthology.get_testdir(self.ctx) - remote = None - for _remote, roles_for_host in self.ctx.cluster.remotes.iteritems(): - for id_ in teuthology.roles_of_type(roles_for_host, service_type): - if id_ == str(service_id): - remote = _remote - assert remote is not None - args = [ - 'sudo', - 'adjust-ulimits', - 'ceph-coverage', - '{tdir}/archive/coverage'.format(tdir=testdir), - 'ceph', - '--admin-daemon', - '/var/run/ceph/ceph-{type}.{id}.asok'.format( - type=service_type, - id=service_id), - ] - args.extend(command) - return remote.run( - args=args, - stdout=StringIO(), - wait=True, - check_status=check_status - ) - - def get_pgid(self, pool, pgnum): - """ - :param pool: pool number - :param pgnum: pg number - :returns: a string representing this pg. - """ - poolnum = self.get_pool_num(pool) - pg_str = "{poolnum}.{pgnum}".format( - poolnum=poolnum, - pgnum=pgnum) - return pg_str - - def get_pg_replica(self, pool, pgnum): - """ - get replica for pool, pgnum (e.g. 
(data, 0)->0 - """ - output = self.raw_cluster_cmd("pg", "dump", '--format=json') - j = json.loads('\n'.join(output.split('\n')[1:])) - pg_str = self.get_pgid(pool, pgnum) - for pg in j['pg_stats']: - if pg['pgid'] == pg_str: - return int(pg['acting'][-1]) - assert False - - def get_pg_primary(self, pool, pgnum): - """ - get primary for pool, pgnum (e.g. (data, 0)->0 - """ - output = self.raw_cluster_cmd("pg", "dump", '--format=json') - j = json.loads('\n'.join(output.split('\n')[1:])) - pg_str = self.get_pgid(pool, pgnum) - for pg in j['pg_stats']: - if pg['pgid'] == pg_str: - return int(pg['acting'][0]) - assert False - - def get_pool_num(self, pool): - """ - get number for pool (e.g., data -> 2) - """ - out = self.raw_cluster_cmd('osd', 'dump', '--format=json') - j = json.loads('\n'.join(out.split('\n')[1:])) - for i in j['pools']: - if i['pool_name'] == pool: - return int(i['pool']) - assert False - - def list_pools(self): - """ - list all pool names - """ - out = self.raw_cluster_cmd('osd', 'dump', '--format=json') - j = json.loads('\n'.join(out.split('\n')[1:])) - self.log(j['pools']) - return [str(i['pool_name']) for i in j['pools']] - - def clear_pools(self): - """ - remove all pools - """ - [self.remove_pool(i) for i in self.list_pools()] - - def kick_recovery_wq(self, osdnum): - """ - Run kick_recovery_wq on cluster. - """ - return self.raw_cluster_cmd( - 'tell', "osd.%d" % (int(osdnum),), - 'debug', - 'kick_recovery_wq', - '0') - - def wait_run_admin_socket(self, service_type, service_id, args=['version'], timeout=75): - """ - If osd_admin_socket call suceeds, return. Otherwise wait - five seconds and try again. 
- """ - tries = 0 - while True: - proc = self.admin_socket(service_type, service_id, args, check_status=False) - if proc.exitstatus is 0: - break - else: - tries += 1 - if (tries * 5) > timeout: - raise Exception('timed out waiting for admin_socket to appear after {type}.{id} restart'.format( - type=service_type, - id=service_id)) - self.log( - "waiting on admin_socket for {type}-{id}, {command}".format( - type=service_type, - id=service_id, - command=args)) - time.sleep(5) - - def set_config(self, osdnum, **argdict): - """ - :param osdnum: osd number - :param argdict: dictionary containing values to set. - """ - for k, v in argdict.iteritems(): - self.wait_run_admin_socket( - 'osd', osdnum, - ['config', 'set', str(k), str(v)]) - - def raw_cluster_status(self): - """ - Get status from cluster - """ - status = self.raw_cluster_cmd('status', '--format=json-pretty') - return json.loads(status) - - def raw_osd_status(self): - """ - Get osd status from cluster - """ - return self.raw_cluster_cmd('osd', 'dump') - - def get_osd_status(self): - """ - Get osd statuses sorted by states that the osds are in. 
- """ - osd_lines = filter( - lambda x: x.startswith('osd.') and (("up" in x) or ("down" in x)), - self.raw_osd_status().split('\n')) - self.log(osd_lines) - in_osds = [int(i[4:].split()[0]) for i in filter( - lambda x: " in " in x, - osd_lines)] - out_osds = [int(i[4:].split()[0]) for i in filter( - lambda x: " out " in x, - osd_lines)] - up_osds = [int(i[4:].split()[0]) for i in filter( - lambda x: " up " in x, - osd_lines)] - down_osds = [int(i[4:].split()[0]) for i in filter( - lambda x: " down " in x, - osd_lines)] - dead_osds = [int(x.id_) for x in - filter(lambda x: not x.running(), self.ctx.daemons.iter_daemons_of_role('osd'))] - live_osds = [int(x.id_) for x in - filter(lambda x: x.running(), self.ctx.daemons.iter_daemons_of_role('osd'))] - return { 'in' : in_osds, 'out' : out_osds, 'up' : up_osds, - 'down' : down_osds, 'dead' : dead_osds, 'live' : live_osds, - 'raw' : osd_lines} - - def get_num_pgs(self): - """ - Check cluster status for the number of pgs - """ - status = self.raw_cluster_status() - self.log(status) - return status['pgmap']['num_pgs'] - - def create_erasure_code_profile(self, profile_name, profile): - """ - Create an erasure code profile name that can be used as a parameter - when creating an erasure coded pool. - """ - with self.lock: - args = cmd_erasure_code_profile(profile_name, profile) - self.raw_cluster_cmd(*args) - - def create_pool_with_unique_name(self, pg_num=16, erasure_code_profile_name=None): - """ - Create a pool named unique_pool_X where X is unique. - """ - name = "" - with self.lock: - name = "unique_pool_%s" % (str(self.next_pool_id),) - self.next_pool_id += 1 - self.create_pool( - name, - pg_num, - erasure_code_profile_name=erasure_code_profile_name) - return name - - def create_pool(self, pool_name, pg_num=16, erasure_code_profile_name=None): - """ - Create a pool named from the pool_name parameter. - :param pool_name: name of the pool being created. - :param pg_num: initial number of pgs. 
- :param erasure_code_profile_name: if set and !None create an erasure coded pool using the profile - """ - with self.lock: - assert isinstance(pool_name, str) - assert isinstance(pg_num, int) - assert pool_name not in self.pools - self.log("creating pool_name %s"%(pool_name,)) - if erasure_code_profile_name: - self.raw_cluster_cmd('osd', 'pool', 'create', pool_name, str(pg_num), str(pg_num), 'erasure', erasure_code_profile_name) - else: - self.raw_cluster_cmd('osd', 'pool', 'create', pool_name, str(pg_num)) - self.pools[pool_name] = pg_num - - def remove_pool(self, pool_name): - """ - Remove the indicated pool - :param pool_name: Pool to be removed - """ - with self.lock: - assert isinstance(pool_name, str) - assert pool_name in self.pools - self.log("removing pool_name %s" % (pool_name,)) - del self.pools[pool_name] - self.do_rados( - self.controller, - ['rmpool', pool_name, pool_name, "--yes-i-really-really-mean-it"] - ) - - def get_pool(self): - """ - Pick a random pool - """ - with self.lock: - return random.choice(self.pools.keys()) - - def get_pool_pg_num(self, pool_name): - """ - Return the number of pgs in the pool specified. - """ - with self.lock: - assert isinstance(pool_name, str) - if pool_name in self.pools: - return self.pools[pool_name] - return 0 - - def get_pool_property(self, pool_name, prop): - """ - :param pool_name: pool - :param prop: property to be checked. - :returns: property as an int value. - """ - with self.lock: - assert isinstance(pool_name, str) - assert isinstance(prop, str) - output = self.raw_cluster_cmd( - 'osd', - 'pool', - 'get', - pool_name, - prop) - return int(output.split()[1]) - - def set_pool_property(self, pool_name, prop, val): - """ - :param pool_name: pool - :param prop: property to be set. - :param val: value to set. - - This routine retries if set operation fails. 
- """ - with self.lock: - assert isinstance(pool_name, str) - assert isinstance(prop, str) - assert isinstance(val, int) - tries = 0 - while True: - r = self.raw_cluster_cmd_result( - 'osd', - 'pool', - 'set', - pool_name, - prop, - str(val)) - if r != 11: # EAGAIN - break - tries += 1 - if tries > 50: - raise Exception('timed out getting EAGAIN when setting pool property %s %s = %s' % (pool_name, prop, val)) - self.log('got EAGAIN setting pool property, waiting a few seconds...') - time.sleep(2) - - def expand_pool(self, pool_name, by, max_pgs): - """ - Increase the number of pgs in a pool - """ - with self.lock: - assert isinstance(pool_name, str) - assert isinstance(by, int) - assert pool_name in self.pools - if self.get_num_creating() > 0: - return - if (self.pools[pool_name] + by) > max_pgs: - return - self.log("increase pool size by %d"%(by,)) - new_pg_num = self.pools[pool_name] + by - self.set_pool_property(pool_name, "pg_num", new_pg_num) - self.pools[pool_name] = new_pg_num - - def set_pool_pgpnum(self, pool_name): - """ - Set pgpnum property of pool_name pool. 
- """ - with self.lock: - assert isinstance(pool_name, str) - assert pool_name in self.pools - if self.get_num_creating() > 0: - return - self.set_pool_property(pool_name, 'pgp_num', self.pools[pool_name]) - - def list_pg_missing(self, pgid): - """ - return list of missing pgs with the id specified - """ - r = None - offset = {} - while True: - out = self.raw_cluster_cmd('--', 'pg', pgid, 'list_missing', - json.dumps(offset)) - j = json.loads(out) - if r is None: - r = j - else: - r['objects'].extend(j['objects']) - if not 'more' in j: - break - if j['more'] == 0: - break - offset = j['objects'][-1]['oid'] - if 'more' in r: - del r['more'] - return r - - def get_pg_stats(self): - """ - Dump the cluster and get pg stats - """ - out = self.raw_cluster_cmd('pg', 'dump', '--format=json') - j = json.loads('\n'.join(out.split('\n')[1:])) - return j['pg_stats'] - - def compile_pg_status(self): - """ - Return a histogram of pg state values - """ - ret = {} - j = self.get_pg_stats() - for pg in j: - for status in pg['state'].split('+'): - if status not in ret: - ret[status] = 0 - ret[status] += 1 - return ret - - def pg_scrubbing(self, pool, pgnum): - """ - pg scrubbing wrapper - """ - pgstr = self.get_pgid(pool, pgnum) - stats = self.get_single_pg_stats(pgstr) - return 'scrub' in stats['state'] - - def pg_repairing(self, pool, pgnum): - """ - pg repairing wrapper - """ - pgstr = self.get_pgid(pool, pgnum) - stats = self.get_single_pg_stats(pgstr) - return 'repair' in stats['state'] - - def pg_inconsistent(self, pool, pgnum): - """ - pg inconsistent wrapper - """ - pgstr = self.get_pgid(pool, pgnum) - stats = self.get_single_pg_stats(pgstr) - return 'inconsistent' in stats['state'] - - def get_last_scrub_stamp(self, pool, pgnum): - """ - Get the timestamp of the last scrub. 
- """ - stats = self.get_single_pg_stats(self.get_pgid(pool, pgnum)) - return stats["last_scrub_stamp"] - - def do_pg_scrub(self, pool, pgnum, stype): - """ - Scrub pg and wait for scrubbing to finish - """ - init = self.get_last_scrub_stamp(pool, pgnum) - self.raw_cluster_cmd('pg', stype, self.get_pgid(pool, pgnum)) - while init == self.get_last_scrub_stamp(pool, pgnum): - self.log("waiting for scrub type %s"%(stype,)) - time.sleep(10) - - def get_single_pg_stats(self, pgid): - """ - Return pg for the pgid specified. - """ - all_stats = self.get_pg_stats() - - for pg in all_stats: - if pg['pgid'] == pgid: - return pg - - return None - - def get_osd_dump(self): - """ - Dump osds - :returns: all osds - """ - out = self.raw_cluster_cmd('osd', 'dump', '--format=json') - j = json.loads('\n'.join(out.split('\n')[1:])) - return j['osds'] - - def get_stuck_pgs(self, type_, threshold): - """ - :returns: stuck pg information from the cluster - """ - out = self.raw_cluster_cmd('pg', 'dump_stuck', type_, str(threshold), - '--format=json') - return json.loads(out) - - def get_num_unfound_objects(self): - """ - Check cluster status to get the number of unfound objects - """ - status = self.raw_cluster_status() - self.log(status) - return status['pgmap'].get('unfound_objects', 0) - - def get_num_creating(self): - """ - Find the number of pgs in creating mode. - """ - pgs = self.get_pg_stats() - num = 0 - for pg in pgs: - if 'creating' in pg['state']: - num += 1 - return num - - def get_num_active_clean(self): - """ - Find the number of active and clean pgs. - """ - pgs = self.get_pg_stats() - num = 0 - for pg in pgs: - if pg['state'].count('active') and pg['state'].count('clean') and not pg['state'].count('stale'): - num += 1 - return num - - def get_num_active_recovered(self): - """ - Find the number of active and recovered pgs. 
- """ - pgs = self.get_pg_stats() - num = 0 - for pg in pgs: - if pg['state'].count('active') and not pg['state'].count('recover') and not pg['state'].count('backfill') and not pg['state'].count('stale'): - num += 1 - return num - - def get_is_making_recovery_progress(self): - """ - Return whether there is recovery progress discernable in the - raw cluster status - """ - status = self.raw_cluster_status() - kps = status['pgmap'].get('recovering_keys_per_sec', 0) - bps = status['pgmap'].get('recovering_bytes_per_sec', 0) - ops = status['pgmap'].get('recovering_objects_per_sec', 0) - return kps > 0 or bps > 0 or ops > 0 - - def get_num_active(self): - """ - Find the number of active pgs. - """ - pgs = self.get_pg_stats() - num = 0 - for pg in pgs: - if pg['state'].count('active') and not pg['state'].count('stale'): - num += 1 - return num - - def get_num_down(self): - """ - Find the number of pgs that are down. - """ - pgs = self.get_pg_stats() - num = 0 - for pg in pgs: - if (pg['state'].count('down') and not pg['state'].count('stale')) or \ - (pg['state'].count('incomplete') and not pg['state'].count('stale')): - num += 1 - return num - - def get_num_active_down(self): - """ - Find the number of pgs that are either active or down. 
- """ - pgs = self.get_pg_stats() - num = 0 - for pg in pgs: - if (pg['state'].count('active') and not pg['state'].count('stale')) or \ - (pg['state'].count('down') and not pg['state'].count('stale')) or \ - (pg['state'].count('incomplete') and not pg['state'].count('stale')): - num += 1 - return num - - def is_clean(self): - """ - True if all pgs are clean - """ - return self.get_num_active_clean() == self.get_num_pgs() - - def is_recovered(self): - """ - True if all pgs have recovered - """ - return self.get_num_active_recovered() == self.get_num_pgs() - - def is_active_or_down(self): - """ - True if all pgs are active or down - """ - return self.get_num_active_down() == self.get_num_pgs() - - def wait_for_clean(self, timeout=None): - """ - Returns trues when all pgs are clean. - """ - self.log("waiting for clean") - start = time.time() - num_active_clean = self.get_num_active_clean() - while not self.is_clean(): - if timeout is not None: - if self.get_is_making_recovery_progress(): - self.log("making progress, resetting timeout") - start = time.time() - else: - self.log("no progress seen, keeping timeout for now") - assert time.time() - start < timeout, \ - 'failed to become clean before timeout expired' - cur_active_clean = self.get_num_active_clean() - if cur_active_clean != num_active_clean: - start = time.time() - num_active_clean = cur_active_clean - time.sleep(3) - self.log("clean!") - - def are_all_osds_up(self): - """ - Returns true if all osds are up. - """ - x = self.get_osd_dump() - return (len(x) == \ - sum([(y['up'] > 0) for y in x])) - - def wait_for_all_up(self, timeout=None): - """ - When this exits, either the timeout has expired, or all - osds are up. 
- """ - self.log("waiting for all up") - start = time.time() - while not self.are_all_osds_up(): - if timeout is not None: - assert time.time() - start < timeout, \ - 'timeout expired in wait_for_all_up' - time.sleep(3) - self.log("all up!") - - def wait_for_recovery(self, timeout=None): - """ - Check peering. When this exists, we have recovered. - """ - self.log("waiting for recovery to complete") - start = time.time() - num_active_recovered = self.get_num_active_recovered() - while not self.is_recovered(): - if timeout is not None: - if self.get_is_making_recovery_progress(): - self.log("making progress, resetting timeout") - start = time.time() - else: - self.log("no progress seen, keeping timeout for now") - assert time.time() - start < timeout, \ - 'failed to recover before timeout expired' - cur_active_recovered = self.get_num_active_recovered() - if cur_active_recovered != num_active_recovered: - start = time.time() - num_active_recovered = cur_active_recovered - time.sleep(3) - self.log("recovered!") - - def wait_for_active(self, timeout=None): - """ - Check peering. When this exists, we are definitely active - """ - self.log("waiting for peering to complete") - start = time.time() - num_active = self.get_num_active() - while not self.is_active(): - if timeout is not None: - assert time.time() - start < timeout, \ - 'failed to recover before timeout expired' - cur_active = self.get_num_active() - if cur_active != num_active: - start = time.time() - num_active = cur_active - time.sleep(3) - self.log("active!") - - def wait_for_active_or_down(self, timeout=None): - """ - Check peering. 
When this exists, we are definitely either - active or down - """ - self.log("waiting for peering to complete or become blocked") - start = time.time() - num_active_down = self.get_num_active_down() - while not self.is_active_or_down(): - if timeout is not None: - assert time.time() - start < timeout, \ - 'failed to recover before timeout expired' - cur_active_down = self.get_num_active_down() - if cur_active_down != num_active_down: - start = time.time() - num_active_down = cur_active_down - time.sleep(3) - self.log("active or down!") - - def osd_is_up(self, osd): - """ - Wrapper for osd check - """ - osds = self.get_osd_dump() - return osds[osd]['up'] > 0 - - def wait_till_osd_is_up(self, osd, timeout=None): - """ - Loop waiting for osd. - """ - self.log('waiting for osd.%d to be up' % osd) - start = time.time() - while not self.osd_is_up(osd): - if timeout is not None: - assert time.time() - start < timeout, \ - 'osd.%d failed to come up before timeout expired' % osd - time.sleep(3) - self.log('osd.%d is up' % osd) - - def is_active(self): - """ - Wrapper to check if active - """ - return self.get_num_active() == self.get_num_pgs() - - def wait_till_active(self, timeout=None): - """ - Wait until osds are active. - """ - self.log("waiting till active") - start = time.time() - while not self.is_active(): - if timeout is not None: - assert time.time() - start < timeout, \ - 'failed to become active before timeout expired' - time.sleep(3) - self.log("active!") - - def mark_out_osd(self, osd): - """ - Wrapper to mark osd out. - """ - self.raw_cluster_cmd('osd', 'out', str(osd)) - - def kill_osd(self, osd): - """ - Kill osds by either power cycling (if indicated by the config) - or by stopping. 
- """ - if self.config.get('powercycle'): - (remote,) = self.ctx.cluster.only('osd.{o}'.format(o=osd)).remotes.iterkeys() - self.log('kill_osd on osd.{o} doing powercycle of {s}'.format(o=osd, s=remote.name)) - assert remote.console is not None, "powercycling requested but RemoteConsole is not initialized. Check ipmi config." - remote.console.power_off() - else: - self.ctx.daemons.get_daemon('osd', osd).stop() - - def blackhole_kill_osd(self, osd): - """ - Stop osd if nothing else works. - """ - self.raw_cluster_cmd('--', 'tell', 'osd.%d' % osd, - 'injectargs', '--filestore-blackhole') - time.sleep(2) - self.ctx.daemons.get_daemon('osd', osd).stop() - - def revive_osd(self, osd, timeout=150): - """ - Revive osds by either power cycling (if indicated by the config) - or by restarting. - """ - if self.config.get('powercycle'): - (remote,) = self.ctx.cluster.only('osd.{o}'.format(o=osd)).remotes.iterkeys() - self.log('kill_osd on osd.{o} doing powercycle of {s}'.format(o=osd, s=remote.name)) - assert remote.console is not None, "powercycling requested but RemoteConsole is not initialized. Check ipmi config." - remote.console.power_on() - if not remote.console.check_status(300): - raise Exception('Failed to revive osd.{o} via ipmi'.format(o=osd)) - teuthology.reconnect(self.ctx, 60, [remote]) - ceph_task.mount_osd_data(self.ctx, remote, str(osd)) - ceph_task.make_admin_daemon_dir(self.ctx, remote) - self.ctx.daemons.get_daemon('osd', osd).reset() - self.ctx.daemons.get_daemon('osd', osd).restart() - # wait for dump_ops_in_flight; this command doesn't appear - # until after the signal handler is installed and it is safe - # to stop the osd again without making valgrind leak checks - # unhappy. see #5924. 
- self.wait_run_admin_socket('osd', osd, - args=['dump_ops_in_flight'], - timeout=timeout) - - def mark_down_osd(self, osd): - """ - Cluster command wrapper - """ - self.raw_cluster_cmd('osd', 'down', str(osd)) - - def mark_in_osd(self, osd): - """ - Cluster command wrapper - """ - self.raw_cluster_cmd('osd', 'in', str(osd)) - - - ## monitors - - def signal_mon(self, mon, sig): - """ - Wrapper to local get_deamon call - """ - self.ctx.daemons.get_daemon('mon', mon).signal(sig) - - def kill_mon(self, mon): - """ - Kill the monitor by either power cycling (if the config says so), - or by doing a stop. - """ - if self.config.get('powercycle'): - (remote,) = self.ctx.cluster.only('mon.{m}'.format(m=mon)).remotes.iterkeys() - self.log('kill_mon on mon.{m} doing powercycle of {s}'.format(m=mon, s=remote.name)) - assert remote.console is not None, "powercycling requested but RemoteConsole is not initialized. Check ipmi config." - remote.console.power_off() - else: - self.ctx.daemons.get_daemon('mon', mon).stop() - - def revive_mon(self, mon): - """ - Restart by either power cycling (if the config says so), - or by doing a normal restart. - """ - if self.config.get('powercycle'): - (remote,) = self.ctx.cluster.only('mon.{m}'.format(m=mon)).remotes.iterkeys() - self.log('revive_mon on mon.{m} doing powercycle of {s}'.format(m=mon, s=remote.name)) - assert remote.console is not None, "powercycling requested but RemoteConsole is not initialized. Check ipmi config." 
- remote.console.power_on() - ceph_task.make_admin_daemon_dir(self.ctx, remote) - self.ctx.daemons.get_daemon('mon', mon).restart() - - def get_mon_status(self, mon): - """ - Extract all the monitor status information from the cluster - """ - addr = self.ctx.ceph.conf['mon.%s' % mon]['mon addr'] - out = self.raw_cluster_cmd('-m', addr, 'mon_status') - return json.loads(out) - - def get_mon_quorum(self): - """ - Extract monitor quorum information from the cluster - """ - out = self.raw_cluster_cmd('quorum_status') - j = json.loads(out) - self.log('quorum_status is %s' % out) - return j['quorum'] - - def wait_for_mon_quorum_size(self, size, timeout=300): - """ - Loop until quorum size is reached. - """ - self.log('waiting for quorum size %d' % size) - start = time.time() - while not len(self.get_mon_quorum()) == size: - if timeout is not None: - assert time.time() - start < timeout, \ - 'failed to reach quorum size %d before timeout expired' % size - time.sleep(3) - self.log("quorum is size %d" % size) - - def get_mon_health(self, debug=False): - """ - Extract all the monitor health information. - """ - out = self.raw_cluster_cmd('health', '--format=json') - if debug: - self.log('health:\n{h}'.format(h=out)) - return json.loads(out) - - ## metadata servers - - def kill_mds(self, mds): - """ - Powercyle if set in config, otherwise just stop. - """ - if self.config.get('powercycle'): - (remote,) = self.ctx.cluster.only('mds.{m}'.format(m=mds)).remotes.iterkeys() - self.log('kill_mds on mds.{m} doing powercycle of {s}'.format(m=mds, s=remote.name)) - assert remote.console is not None, "powercycling requested but RemoteConsole is not initialized. Check ipmi config." - remote.console.power_off() - else: - self.ctx.daemons.get_daemon('mds', mds).stop() - - def kill_mds_by_rank(self, rank): - """ - kill_mds wrapper to kill based on rank passed. 
- """ - status = self.get_mds_status_by_rank(rank) - self.kill_mds(status['name']) - - def revive_mds(self, mds, standby_for_rank=None): - """ - Revive mds -- do an ipmpi powercycle (if indicated by the config) - and then restart (using --hot-standby if specified. - """ - if self.config.get('powercycle'): - (remote,) = self.ctx.cluster.only('mds.{m}'.format(m=mds)).remotes.iterkeys() - self.log('revive_mds on mds.{m} doing powercycle of {s}'.format(m=mds, s=remote.name)) - assert remote.console is not None, "powercycling requested but RemoteConsole is not initialized. Check ipmi config." - remote.console.power_on() - ceph_task.make_admin_daemon_dir(self.ctx, remote) - args = [] - if standby_for_rank: - args.extend(['--hot-standby', standby_for_rank]) - self.ctx.daemons.get_daemon('mds', mds).restart(*args) - - def revive_mds_by_rank(self, rank, standby_for_rank=None): - """ - revive_mds wrapper to revive based on rank passed. - """ - status = self.get_mds_status_by_rank(rank) - self.revive_mds(status['name'], standby_for_rank) - - def get_mds_status(self, mds): - """ - Run cluster commands for the mds in order to get mds information - """ - out = self.raw_cluster_cmd('mds', 'dump', '--format=json') - j = json.loads(' '.join(out.splitlines()[1:])) - # collate; for dup ids, larger gid wins. - for info in j['info'].itervalues(): - if info['name'] == mds: - return info - return None - - def get_mds_status_by_rank(self, rank): - """ - Run cluster commands for the mds in order to get mds information - check rank. - """ - out = self.raw_cluster_cmd('mds', 'dump', '--format=json') - j = json.loads(' '.join(out.splitlines()[1:])) - # collate; for dup ids, larger gid wins. - for info in j['info'].itervalues(): - if info['rank'] == rank: - return info - return None - - def get_mds_status_all(self): - """ - Run cluster command to extract all the mds status. 
- """ - out = self.raw_cluster_cmd('mds', 'dump', '--format=json') - j = json.loads(' '.join(out.splitlines()[1:])) - return j diff --git a/teuthology/task/cephfs/__init__.py b/teuthology/task/cephfs/__init__.py deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/teuthology/task/cephfs/filesystem.py b/teuthology/task/cephfs/filesystem.py deleted file mode 100644 index 1b83d81554..0000000000 --- a/teuthology/task/cephfs/filesystem.py +++ /dev/null @@ -1,221 +0,0 @@ - -from StringIO import StringIO -import json -import logging -import time - -from teuthology import misc -from teuthology.parallel import parallel -from teuthology.task import ceph_manager - - -log = logging.getLogger(__name__) - - -DAEMON_WAIT_TIMEOUT = 120 - - -class Filesystem(object): - """ - This object is for driving a CephFS filesystem. - - Limitations: - * Assume a single filesystem+cluster - * Assume a single MDS - """ - def __init__(self, ctx, config): - self._ctx = ctx - self._config = config - - self.mds_ids = list(misc.all_roles_of_type(ctx.cluster, 'mds')) - if len(self.mds_ids) == 0: - raise RuntimeError("This task requires at least one MDS") - - first_mon = misc.get_first_mon(ctx, config) - (mon_remote,) = ctx.cluster.only(first_mon).remotes.iterkeys() - self.mon_manager = ceph_manager.CephManager(mon_remote, ctx=ctx, logger=log.getChild('ceph_manager')) - self.mds_daemons = dict([(mds_id, self._ctx.daemons.get_daemon('mds', mds_id)) for mds_id in self.mds_ids]) - - client_list = list(misc.all_roles_of_type(self._ctx.cluster, 'client')) - self.client_id = client_list[0] - self.client_remote = list(misc.get_clients(ctx=ctx, roles=["client.{0}".format(self.client_id)]))[0][1] - - def are_daemons_healthy(self): - """ - Return true if all daemons are in one of active, standby, standby-replay - :return: - """ - status = self.mon_manager.get_mds_status_all() - for mds_id, mds_status in status['info'].items(): - if mds_status['state'] not in ["up:active", "up:standby", 
"up:standby-replay"]: - log.warning("Unhealthy mds state {0}:{1}".format(mds_id, mds_status['state'])) - return False - - return True - - def wait_for_daemons(self, timeout=None): - """ - Wait until all daemons are healthy - :return: - """ - - if timeout is None: - timeout = DAEMON_WAIT_TIMEOUT - - elapsed = 0 - while True: - if self.are_daemons_healthy(): - return - else: - time.sleep(1) - elapsed += 1 - - if elapsed > timeout: - raise RuntimeError("Timed out waiting for MDS daemons to become healthy") - - def get_lone_mds_id(self): - if len(self.mds_ids) != 1: - raise ValueError("Explicit MDS argument required when multiple MDSs in use") - else: - return self.mds_ids[0] - - def _one_or_all(self, mds_id, cb): - """ - Call a callback for a single named MDS, or for all - - :param mds_id: MDS daemon name, or None - :param cb: Callback taking single argument of MDS daemon name - """ - if mds_id is None: - with parallel() as p: - for mds_id in self.mds_ids: - p.spawn(cb, mds_id) - else: - cb(mds_id) - - def mds_stop(self, mds_id=None): - """ - Stop the MDS daemon process(se). If it held a rank, that rank - will eventually go laggy. - """ - self._one_or_all(mds_id, lambda id_: self.mds_daemons[id_].stop()) - - def mds_fail(self, mds_id=None): - """ - Inform MDSMonitor of the death of the daemon process(es). If it held - a rank, that rank will be relinquished. - """ - self._one_or_all(mds_id, lambda id_: self.mon_manager.raw_cluster_cmd("mds", "fail", id_)) - - def mds_restart(self, mds_id=None): - self._one_or_all(mds_id, lambda id_: self.mds_daemons[id_].restart()) - - def mds_fail_restart(self, mds_id=None): - """ - Variation on restart that includes marking MDSs as failed, so that doing this - operation followed by waiting for healthy daemon states guarantees that they - have gone down and come up, rather than potentially seeing the healthy states - that existed before the restart. 
- """ - def _fail_restart(id_): - self.mds_daemons[id_].stop() - self.mon_manager.raw_cluster_cmd("mds", "fail", id_) - self.mds_daemons[id_].restart() - - self._one_or_all(mds_id, _fail_restart) - - def reset(self): - log.info("Creating new filesystem") - - self.mon_manager.raw_cluster_cmd_result('mds', 'set', "max_mds", "0") - for mds_id in self.mds_ids: - assert not self._ctx.daemons.get_daemon('mds', mds_id).running() - self.mon_manager.raw_cluster_cmd_result('mds', 'fail', mds_id) - self.mon_manager.raw_cluster_cmd_result('fs', 'rm', "default", "--yes-i-really-mean-it") - self.mon_manager.raw_cluster_cmd_result('fs', 'new', "default", "metadata", "data") - - def get_metadata_object(self, object_type, object_id): - """ - Retrieve an object from the metadata pool, pass it through - ceph-dencoder to dump it to JSON, and return the decoded object. - """ - temp_bin_path = '/tmp/out.bin' - - self.client_remote.run(args=[ - 'sudo', 'rados', '-p', 'metadata', 'get', object_id, temp_bin_path - ]) - - stdout = StringIO() - self.client_remote.run(args=[ - 'sudo', 'ceph-dencoder', 'type', object_type, 'import', temp_bin_path, 'decode', 'dump_json' - ], stdout=stdout) - dump_json = stdout.getvalue().strip() - try: - dump = json.loads(dump_json) - except (TypeError, ValueError): - log.error("Failed to decode JSON: '{0}'".format(dump_json)) - raise - - return dump - - def get_journal_version(self): - """ - Read the JournalPointer and Journal::Header objects to learn the version of - encoding in use. 
- """ - journal_pointer_object = '400.00000000' - journal_pointer_dump = self.get_metadata_object("JournalPointer", journal_pointer_object) - journal_ino = journal_pointer_dump['journal_pointer']['front'] - - journal_header_object = "{0:x}.00000000".format(journal_ino) - journal_header_dump = self.get_metadata_object('Journaler::Header', journal_header_object) - - version = journal_header_dump['journal_header']['stream_format'] - log.info("Read journal version {0}".format(version)) - - return version - - def mds_asok(self, command, mds_id=None): - if mds_id is None: - mds_id = self.get_lone_mds_id() - proc = self.mon_manager.admin_socket('mds', mds_id, command) - response_data = proc.stdout.getvalue() - log.info("mds_asok output: {0}".format(response_data)) - if response_data.strip(): - return json.loads(response_data) - else: - return None - - def wait_for_state(self, goal_state, reject=None, timeout=None, mds_id=None): - """ - Block until the MDS reaches a particular state, or a failure condition - is met. 
- - :param goal_state: Return once the MDS is in this state - :param reject: Fail if the MDS enters this state before the goal state - :param timeout: Fail if this many seconds pass before reaching goal - :return: number of seconds waited, rounded down to integer - """ - - if mds_id is None: - mds_id = self.get_lone_mds_id() - - elapsed = 0 - while True: - # mds_info is None if no daemon currently claims this rank - mds_info = self.mon_manager.get_mds_status(mds_id) - current_state = mds_info['state'] if mds_info else None - - if current_state == goal_state: - log.info("reached state '{0}' in {1}s".format(current_state, elapsed)) - return elapsed - elif reject is not None and current_state == reject: - raise RuntimeError("MDS in reject state {0}".format(current_state)) - elif timeout is not None and elapsed > timeout: - raise RuntimeError( - "Reached timeout after {0} seconds waiting for state {1}, while in state {2}".format( - elapsed, goal_state, current_state - )) - else: - time.sleep(1) - elapsed += 1 \ No newline at end of file diff --git a/teuthology/task/cephfs/fuse_mount.py b/teuthology/task/cephfs/fuse_mount.py deleted file mode 100644 index 98980573bf..0000000000 --- a/teuthology/task/cephfs/fuse_mount.py +++ /dev/null @@ -1,253 +0,0 @@ - -from StringIO import StringIO -import json -import time -import os -import logging - -from teuthology import misc -from ...orchestra import run -from teuthology.orchestra.run import CommandFailedError -from teuthology.task.cephfs.mount import CephFSMount - -log = logging.getLogger(__name__) - - -class FuseMount(CephFSMount): - def __init__(self, client_config, test_dir, client_id, client_remote): - super(FuseMount, self).__init__(test_dir, client_id, client_remote) - - self.client_config = client_config if client_config else {} - self.fuse_daemon = None - - def mount(self): - log.info("Client client.%s config is %s" % (self.client_id, self.client_config)) - - daemon_signal = 'kill' - if 
self.client_config.get('coverage') or self.client_config.get('valgrind') is not None: - daemon_signal = 'term' - - mnt = os.path.join(self.test_dir, 'mnt.{id}'.format(id=self.client_id)) - log.info('Mounting ceph-fuse client.{id} at {remote} {mnt}...'.format( - id=self.client_id, remote=self.client_remote, mnt=mnt)) - - self.client_remote.run( - args=[ - 'mkdir', - '--', - mnt, - ], - ) - - run_cmd = [ - 'sudo', - 'adjust-ulimits', - 'ceph-coverage', - '{tdir}/archive/coverage'.format(tdir=self.test_dir), - 'daemon-helper', - daemon_signal, - ] - run_cmd_tail = [ - 'ceph-fuse', - '-f', - '--name', 'client.{id}'.format(id=self.client_id), - # TODO ceph-fuse doesn't understand dash dash '--', - mnt, - ] - - if self.client_config.get('valgrind') is not None: - run_cmd = misc.get_valgrind_args( - self.test_dir, - 'client.{id}'.format(id=self.client_id), - run_cmd, - self.client_config.get('valgrind'), - ) - - run_cmd.extend(run_cmd_tail) - - proc = self.client_remote.run( - args=run_cmd, - logger=log.getChild('ceph-fuse.{id}'.format(id=self.client_id)), - stdin=run.PIPE, - wait=False, - ) - self.fuse_daemon = proc - - def is_mounted(self): - proc = self.client_remote.run( - args=[ - 'stat', - '--file-system', - '--printf=%T\n', - '--', - self.mountpoint, - ], - stdout=StringIO(), - ) - fstype = proc.stdout.getvalue().rstrip('\n') - if fstype == 'fuseblk': - log.info('ceph-fuse is mounted on %s', self.mountpoint) - return True - else: - log.debug('ceph-fuse not mounted, got fs type {fstype!r}'.format( - fstype=fstype)) - - def wait_until_mounted(self): - """ - Check to make sure that fuse is mounted on mountpoint. If not, - sleep for 5 seconds and check again. - """ - - while not self.is_mounted(): - # Even if it's not mounted, it should at least - # be running: catch simple failures where it has terminated. 
- assert not self.fuse_daemon.poll() - - time.sleep(5) - - # Now that we're mounted, set permissions so that the rest of the test will have - # unrestricted access to the filesystem mount. - self.client_remote.run( - args=['sudo', 'chmod', '1777', '{tdir}/mnt.{id}'.format(tdir=self.test_dir, id=self.client_id)], ) - - def umount(self): - try: - self.client_remote.run( - args=[ - 'sudo', - 'fusermount', - '-u', - self.mountpoint, - ], - ) - except run.CommandFailedError: - # FIXME: this will clobber all FUSE mounts, not just this one - - log.info('Failed to unmount ceph-fuse on {name}, aborting...'.format(name=self.client_remote.name)) - # abort the fuse mount, killing all hung processes - self.client_remote.run( - args=[ - 'if', 'test', '-e', '/sys/fs/fuse/connections/*/abort', - run.Raw(';'), 'then', - 'echo', - '1', - run.Raw('>'), - run.Raw('/sys/fs/fuse/connections/*/abort'), - run.Raw(';'), 'fi', - ], - ) - # make sure its unmounted - self.client_remote.run( - args=[ - 'sudo', - 'umount', - '-l', - '-f', - self.mountpoint, - ], - ) - - def umount_wait(self, force=False): - """ - :param force: Complete even if the MDS is offline - """ - self.umount() - if force: - self.fuse_daemon.stdin.close() - try: - self.fuse_daemon.wait() - except CommandFailedError: - pass - self.cleanup() - - def cleanup(self): - """ - Remove the mount point. - - Prerequisite: the client is not mounted. - """ - self.client_remote.run( - args=[ - 'rmdir', - '--', - self.mountpoint, - ], - ) - - def kill(self): - """ - Terminate the client without removing the mount point. - """ - self.fuse_daemon.stdin.close() - try: - self.fuse_daemon.wait() - except CommandFailedError: - pass - - def kill_cleanup(self): - """ - Follow up ``kill`` to get to a clean unmounted state. - """ - self.umount() - self.cleanup() - - def teardown(self): - """ - Whatever the state of the mount, get it gone. 
- """ - super(FuseMount, self).teardown() - - self.umount() - if not self.fuse_daemon.finished: - self.fuse_daemon.stdin.close() - try: - self.fuse_daemon.wait() - except CommandFailedError: - pass - - # Indiscriminate, unlike the touchier cleanup() - self.client_remote.run( - args=[ - 'rm', - '-rf', - self.mountpoint, - ], - ) - - # FIXME: bad naming scheme to call this client_id and also have the - # 'client_id' attr which is something completely different. This - # is what a MonSession calls global_id. - def get_client_id(self): - """ - Look up the CephFS client ID for this mount - """ - - pyscript = """ -import glob -import re -import os -import subprocess - -def find_socket(client_name): - files = glob.glob("/var/run/ceph/ceph-{{client_name}}.*.asok".format(client_name=client_name)) - for f in files: - pid = re.match(".*\.(\d+)\.asok$", f).group(1) - if os.path.exists("/proc/{{0}}".format(pid)): - return f - raise RuntimeError("Client socket {{0}} not found".format(client_name)) - -print find_socket("{client_name}") -""".format(client_name="client.{0}".format(self.client_id)) - - # Find the admin socket - p = self.client_remote.run(args=[ - 'python', '-c', pyscript - ], stdout=StringIO()) - asok_path = p.stdout.getvalue().strip() - log.info("Found client admin socket at {0}".format(asok_path)) - - # Query client ID from admin socket - p = self.client_remote.run( - args=['sudo', 'ceph', '--admin-daemon', asok_path, 'mds_sessions'], - stdout=StringIO()) - return json.loads(p.stdout.getvalue())['id'] diff --git a/teuthology/task/cephfs/kernel_mount.py b/teuthology/task/cephfs/kernel_mount.py deleted file mode 100644 index c319084697..0000000000 --- a/teuthology/task/cephfs/kernel_mount.py +++ /dev/null @@ -1,98 +0,0 @@ -import logging -import os - -from teuthology.orchestra import run -from teuthology.task.cephfs.mount import CephFSMount - -log = logging.getLogger(__name__) - - -class KernelMount(CephFSMount): - def __init__(self, mons, test_dir, client_id, 
client_remote): - super(KernelMount, self).__init__(test_dir, client_id, client_remote) - self.mons = mons - - def write_secret_file(self, remote, role, keyring, filename): - """ - Stash the keyring in the filename specified. - """ - remote.run( - args=[ - 'adjust-ulimits', - 'ceph-coverage', - '{tdir}/archive/coverage'.format(tdir=self.test_dir), - 'ceph-authtool', - '--name={role}'.format(role=role), - '--print-key', - keyring, - run.Raw('>'), - filename, - ], - ) - - def mount(self): - log.info('Mounting kclient client.{id} at {remote} {mnt}...'.format( - id=self.client_id, remote=self.client_remote, mnt=self.mountpoint)) - - keyring = '/etc/ceph/ceph.client.{id}.keyring'.format(id=self.client_id) - secret = '{tdir}/data/client.{id}.secret'.format(tdir=self.test_dir, id=self.client_id) - self.write_secret_file(self.client_remote, 'client.{id}'.format(id=self.client_id), - keyring, secret) - - self.client_remote.run( - args=[ - 'mkdir', - '--', - self.mountpoint, - ], - ) - - self.client_remote.run( - args=[ - 'sudo', - 'adjust-ulimits', - 'ceph-coverage', - '{tdir}/archive/coverage'.format(tdir=self.test_dir), - '/sbin/mount.ceph', - '{mons}:/'.format(mons=','.join(self.mons)), - self.mountpoint, - '-v', - '-o', - 'name={id},secretfile={secret}'.format(id=self.client_id, - secret=secret), - ], - ) - - def umount(self): - log.debug('Unmounting client client.{id}...'.format(id=self.client_id)) - mnt = os.path.join(self.test_dir, 'mnt.{id}'.format(id=self.client_id)) - self.client_remote.run( - args=[ - 'sudo', - 'umount', - mnt, - ], - ) - self.client_remote.run( - args=[ - 'rmdir', - '--', - mnt, - ], - ) - - def cleanup(self): - pass - - def umount_wait(self): - pass - - def is_mounted(self): - return True - - def wait_until_mounted(self): - pass - - def teardown(self): - super(KernelMount, self).teardown() - self.umount() diff --git a/teuthology/task/cephfs/mount.py b/teuthology/task/cephfs/mount.py deleted file mode 100644 index 42b943db37..0000000000 --- 
a/teuthology/task/cephfs/mount.py +++ /dev/null @@ -1,158 +0,0 @@ -from contextlib import contextmanager -import logging -import datetime -from textwrap import dedent -import os -from teuthology.orchestra import run -from teuthology.orchestra.run import CommandFailedError - -log = logging.getLogger(__name__) - - -class CephFSMount(object): - def __init__(self, test_dir, client_id, client_remote): - """ - :param test_dir: Global teuthology test dir - :param client_id: Client ID, the 'foo' in client.foo - :param client_remote: Remote instance for the host where client will run - """ - - self.test_dir = test_dir - self.client_id = client_id - self.client_remote = client_remote - - self.mountpoint = os.path.join(self.test_dir, 'mnt.{id}'.format(id=self.client_id)) - self.test_files = ['a', 'b', 'c'] - - self.background_procs = [] - - def is_mounted(self): - raise NotImplementedError() - - def mount(self): - raise NotImplementedError() - - def umount(self): - raise NotImplementedError() - - def umount_wait(self): - raise NotImplementedError() - - def kill_cleanup(self): - raise NotImplementedError() - - def kill(self): - raise NotImplementedError() - - def cleanup(self): - raise NotImplementedError() - - def wait_until_mounted(self): - raise NotImplementedError() - - @contextmanager - def mounted(self): - """ - A context manager, from an initially unmounted state, to mount - this, yield, and then unmount and clean up. 
- """ - self.mount() - self.wait_until_mounted() - try: - yield - finally: - self.umount_wait() - - def create_files(self): - assert(self.is_mounted()) - - for suffix in self.test_files: - log.info("Creating file {0}".format(suffix)) - self.client_remote.run(args=[ - 'sudo', 'touch', os.path.join(self.mountpoint, suffix) - ]) - - def check_files(self): - assert(self.is_mounted()) - - for suffix in self.test_files: - log.info("Checking file {0}".format(suffix)) - r = self.client_remote.run(args=[ - 'sudo', 'ls', os.path.join(self.mountpoint, suffix) - ], check_status=False) - if r.exitstatus != 0: - raise RuntimeError("Expected file {0} not found".format(suffix)) - - def create_destroy(self): - assert(self.is_mounted()) - - filename = "{0} {1}".format(datetime.datetime.now(), self.client_id) - log.debug("Creating test file {0}".format(filename)) - self.client_remote.run(args=[ - 'sudo', 'touch', os.path.join(self.mountpoint, filename) - ]) - log.debug("Deleting test file {0}".format(filename)) - self.client_remote.run(args=[ - 'sudo', 'rm', '-f', os.path.join(self.mountpoint, filename) - ]) - - def _run_python(self, pyscript): - return self.client_remote.run(args=[ - 'sudo', 'daemon-helper', 'kill', 'python', '-c', pyscript - ], wait=False, stdin=run.PIPE) - - def open_background(self, basename="background_file"): - """ - Open a file for writing, then block such that the client - will hold a capability - """ - assert(self.is_mounted()) - - path = os.path.join(self.mountpoint, basename) - - pyscript = dedent(""" - import time - - f = open("{path}", 'w') - f.write('content') - f.flush() - f.write('content2') - while True: - time.sleep(1) - """).format(path=path) - - rproc = self._run_python(pyscript) - self.background_procs.append(rproc) - return rproc - - def write_background(self, basename="background_file"): - """ - Open a file for writing, complete as soon as you can - :param basename: - :return: - """ - assert(self.is_mounted()) - - path = 
os.path.join(self.mountpoint, basename) - - pyscript = dedent(""" - import time - - f = open("{path}", 'w') - f.write('content') - f.close() - """).format(path=path) - - rproc = self._run_python(pyscript) - self.background_procs.append(rproc) - return rproc - - def teardown(self): - for p in self.background_procs: - log.info("Terminating background process") - if p.stdin: - p.stdin.close() - try: - p.wait() - except CommandFailedError: - pass diff --git a/teuthology/task/chef.py b/teuthology/task/chef.py deleted file mode 100644 index db793c3939..0000000000 --- a/teuthology/task/chef.py +++ /dev/null @@ -1,35 +0,0 @@ -""" -Chef-solo task -""" -import logging - -from ..orchestra import run -from .. import misc - -log = logging.getLogger(__name__) - -def task(ctx, config): - """ - Run chef-solo on all nodes. - """ - log.info('Running chef-solo...') - - run.wait( - ctx.cluster.run( - args=[ - 'wget', -# '-q', - '-O-', -# 'https://raw.github.com/ceph/ceph-qa-chef/master/solo/solo-from-scratch', - 'http://ceph.com/git/?p=ceph-qa-chef.git;a=blob_plain;f=solo/solo-from-scratch;hb=HEAD', - run.Raw('|'), - 'sh', - '-x', - ], - wait=False, - ) - ) - - log.info('Reconnecting after ceph-qa-chef run') - misc.reconnect(ctx, 10) #Reconnect for ulimit and other ceph-qa-chef changes - diff --git a/teuthology/task/cifs_mount.py b/teuthology/task/cifs_mount.py deleted file mode 100644 index ac58f31cc0..0000000000 --- a/teuthology/task/cifs_mount.py +++ /dev/null @@ -1,137 +0,0 @@ -""" -Mount cifs clients. Unmount when finished. -""" -import contextlib -import logging -import os - -from teuthology import misc as teuthology -from ..orchestra import run - -log = logging.getLogger(__name__) - -@contextlib.contextmanager -def task(ctx, config): - """ - Mount/unmount a cifs client. - - The config is optional and defaults to mounting on all clients. If - a config is given, it is expected to be a list of clients to do - this operation on. 
- - Example that starts smbd and mounts cifs on all nodes:: - - tasks: - - ceph: - - samba: - - cifs-mount: - - interactive: - - Example that splits smbd and cifs: - - tasks: - - ceph: - - samba: [samba.0] - - cifs-mount: [client.0] - - ceph-fuse: [client.1] - - interactive: - - Example that specifies the share name: - - tasks: - - ceph: - - ceph-fuse: - - samba: - samba.0: - cephfuse: "{testdir}/mnt.0" - - cifs-mount: - client.0: - share: cephfuse - - :param ctx: Context - :param config: Configuration - """ - log.info('Mounting cifs clients...') - - if config is None: - config = dict(('client.{id}'.format(id=id_), None) - for id_ in teuthology.all_roles_of_type(ctx.cluster, 'client')) - elif isinstance(config, list): - config = dict((name, None) for name in config) - - clients = list(teuthology.get_clients(ctx=ctx, roles=config.keys())) - - from teuthology.task.samba import get_sambas - samba_roles = ['samba.{id_}'.format(id_=id_) for id_ in teuthology.all_roles_of_type(ctx.cluster, 'samba')] - sambas = list(get_sambas(ctx=ctx, roles=samba_roles)) - (ip, _) = sambas[0][1].ssh.get_transport().getpeername() - log.info('samba ip: {ip}'.format(ip=ip)) - - for id_, remote in clients: - mnt = os.path.join(teuthology.get_testdir(ctx), 'mnt.{id}'.format(id=id_)) - log.info('Mounting cifs client.{id} at {remote} {mnt}...'.format( - id=id_, remote=remote,mnt=mnt)) - - remote.run( - args=[ - 'mkdir', - '--', - mnt, - ], - ) - - rolestr = 'client.{id_}'.format(id_=id_) - unc = "ceph" - log.info("config: {c}".format(c=config)) - if config[rolestr] is not None and 'share' in config[rolestr]: - unc = config[rolestr]['share'] - - remote.run( - args=[ - 'sudo', - 'mount', - '-t', - 'cifs', - '//{sambaip}/{unc}'.format(sambaip=ip, unc=unc), - '-o', - 'username=ubuntu,password=ubuntu', - mnt, - ], - ) - - remote.run( - args=[ - 'sudo', - 'chown', - 'ubuntu:ubuntu', - '{m}/'.format(m=mnt), - ], - ) - - try: - yield - finally: - log.info('Unmounting cifs clients...') - for id_, remote 
in clients: - remote.run( - args=[ - 'sudo', - 'umount', - mnt, - ], - ) - for id_, remote in clients: - while True: - try: - remote.run( - args=[ - 'rmdir', '--', mnt, - run.Raw('2>&1'), - run.Raw('|'), - 'grep', 'Device or resource busy', - ], - ) - import time - time.sleep(1) - except Exception: - break diff --git a/teuthology/task/cram.py b/teuthology/task/cram.py deleted file mode 100644 index 05824d26ab..0000000000 --- a/teuthology/task/cram.py +++ /dev/null @@ -1,135 +0,0 @@ -""" -Cram tests -""" -import logging -import os - -from teuthology import misc as teuthology -from teuthology.parallel import parallel -from ..orchestra import run - -log = logging.getLogger(__name__) - -def task(ctx, config): - """ - Run all cram tests from the specified urls on the specified - clients. Each client runs tests in parallel. - - Limitations: - Tests must have a .t suffix. Tests with duplicate names will - overwrite each other, so only the last one will run. - - For example:: - - tasks: - - ceph: - - cram: - clients: - client.0: - - http://ceph.com/qa/test.t - - http://ceph.com/qa/test2.t] - client.1: [http://ceph.com/qa/test.t] - - You can also run a list of cram tests on all clients:: - - tasks: - - ceph: - - cram: - clients: - all: [http://ceph.com/qa/test.t] - - :param ctx: Context - :param config: Configuration - """ - assert isinstance(config, dict) - assert 'clients' in config and isinstance(config['clients'], dict), \ - 'configuration must contain a dictionary of clients' - - clients = teuthology.replace_all_with_clients(ctx.cluster, - config['clients']) - testdir = teuthology.get_testdir(ctx) - - try: - for client, tests in clients.iteritems(): - (remote,) = ctx.cluster.only(client).remotes.iterkeys() - client_dir = '{tdir}/archive/cram.{role}'.format(tdir=testdir, role=client) - remote.run( - args=[ - 'mkdir', '--', client_dir, - run.Raw('&&'), - 'virtualenv', '{tdir}/virtualenv'.format(tdir=testdir), - run.Raw('&&'), - 
'{tdir}/virtualenv/bin/pip'.format(tdir=testdir), - 'install', 'cram', - ], - ) - for test in tests: - log.info('fetching test %s for %s', test, client) - assert test.endswith('.t'), 'tests must end in .t' - remote.run( - args=[ - 'wget', '-nc', '-nv', '-P', client_dir, '--', test, - ], - ) - - with parallel() as p: - for role in clients.iterkeys(): - p.spawn(_run_tests, ctx, role) - finally: - for client, tests in clients.iteritems(): - (remote,) = ctx.cluster.only(client).remotes.iterkeys() - client_dir = '{tdir}/archive/cram.{role}'.format(tdir=testdir, role=client) - test_files = set([test.rsplit('/', 1)[1] for test in tests]) - - # remove test files unless they failed - for test_file in test_files: - abs_file = os.path.join(client_dir, test_file) - remote.run( - args=[ - 'test', '-f', abs_file + '.err', - run.Raw('||'), - 'rm', '-f', '--', abs_file, - ], - ) - - # ignore failure since more than one client may - # be run on a host, and the client dir should be - # non-empty if the test failed - remote.run( - args=[ - 'rm', '-rf', '--', - '{tdir}/virtualenv'.format(tdir=testdir), - run.Raw(';'), - 'rmdir', '--ignore-fail-on-non-empty', client_dir, - ], - ) - -def _run_tests(ctx, role): - """ - For each role, check to make sure it's a client, then run the cram on that client - - :param ctx: Context - :param role: Roles - """ - assert isinstance(role, basestring) - PREFIX = 'client.' 
- assert role.startswith(PREFIX) - id_ = role[len(PREFIX):] - (remote,) = ctx.cluster.only(role).remotes.iterkeys() - ceph_ref = ctx.summary.get('ceph-sha1', 'master') - - testdir = teuthology.get_testdir(ctx) - log.info('Running tests for %s...', role) - remote.run( - args=[ - run.Raw('CEPH_REF={ref}'.format(ref=ceph_ref)), - run.Raw('CEPH_ID="{id}"'.format(id=id_)), - 'adjust-ulimits', - 'ceph-coverage', - '{tdir}/archive/coverage'.format(tdir=testdir), - '{tdir}/virtualenv/bin/cram'.format(tdir=testdir), - '-v', '--', - run.Raw('{tdir}/archive/cram.{role}/*.t'.format(tdir=testdir, role=role)), - ], - logger=log.getChild(role), - ) diff --git a/teuthology/task/devstack.py b/teuthology/task/devstack.py deleted file mode 100644 index c676acea17..0000000000 --- a/teuthology/task/devstack.py +++ /dev/null @@ -1,382 +0,0 @@ -#!/usr/bin/env python -import contextlib -import logging -from cStringIO import StringIO -import textwrap -from configparser import ConfigParser -import time - -from ..orchestra import run -from .. import misc -from ..contextutil import nested - -log = logging.getLogger(__name__) - -DEVSTACK_GIT_REPO = 'https://github.com/openstack-dev/devstack.git' -DS_STABLE_BRANCHES = ("havana", "grizzly") - -is_devstack_node = lambda role: role.startswith('devstack') -is_osd_node = lambda role: role.startswith('osd') - - -@contextlib.contextmanager -def task(ctx, config): - if config is None: - config = {} - if not isinstance(config, dict): - raise TypeError("config must be a dict") - with nested(lambda: install(ctx=ctx, config=config), - lambda: smoke(ctx=ctx, config=config), - ): - yield - - -@contextlib.contextmanager -def install(ctx, config): - """ - Install OpenStack DevStack and configure it to use a Ceph cluster for - Glance and Cinder. - - Requires one node with a role 'devstack' - - Since devstack runs rampant on the system it's used on, typically you will - want to reprovision that machine after using devstack on it. 
- - Also, the default 2GB of RAM that is given to vps nodes is insufficient. I - recommend 4GB. Downburst can be instructed to give 4GB to a vps node by - adding this to the yaml: - - downburst: - ram: 4G - - This was created using documentation found here: - https://github.com/openstack-dev/devstack/blob/master/README.md - http://ceph.com/docs/master/rbd/rbd-openstack/ - """ - if config is None: - config = {} - if not isinstance(config, dict): - raise TypeError("config must be a dict") - - devstack_node = ctx.cluster.only(is_devstack_node).remotes.keys()[0] - an_osd_node = ctx.cluster.only(is_osd_node).remotes.keys()[0] - - devstack_branch = config.get("branch", "master") - install_devstack(devstack_node, devstack_branch) - try: - configure_devstack_and_ceph(ctx, config, devstack_node, an_osd_node) - yield - finally: - pass - - -def install_devstack(devstack_node, branch="master"): - log.info("Cloning DevStack repo...") - - args = ['git', 'clone', DEVSTACK_GIT_REPO] - devstack_node.run(args=args) - - if branch != "master": - if branch in DS_STABLE_BRANCHES and not branch.startswith("stable"): - branch = "stable/" + branch - log.info("Checking out {branch} branch...".format(branch=branch)) - cmd = "cd devstack && git checkout " + branch - devstack_node.run(args=cmd) - - log.info("Installing DevStack...") - args = ['cd', 'devstack', run.Raw('&&'), './stack.sh'] - devstack_node.run(args=args) - - -def configure_devstack_and_ceph(ctx, config, devstack_node, ceph_node): - pool_size = config.get('pool_size', '128') - create_pools(ceph_node, pool_size) - distribute_ceph_conf(devstack_node, ceph_node) - # This is where we would install python-ceph and ceph-common but it appears - # the ceph task does that for us. 
- generate_ceph_keys(ceph_node) - distribute_ceph_keys(devstack_node, ceph_node) - secret_uuid = set_libvirt_secret(devstack_node, ceph_node) - update_devstack_config_files(devstack_node, secret_uuid) - set_apache_servername(devstack_node) - # Rebooting is the most-often-used method of restarting devstack services - misc.reboot(devstack_node) - start_devstack(devstack_node) - restart_apache(devstack_node) - - -def create_pools(ceph_node, pool_size): - log.info("Creating pools on Ceph cluster...") - - for pool_name in ['volumes', 'images', 'backups']: - args = ['ceph', 'osd', 'pool', 'create', pool_name, pool_size] - ceph_node.run(args=args) - - -def distribute_ceph_conf(devstack_node, ceph_node): - log.info("Copying ceph.conf to DevStack node...") - - ceph_conf_path = '/etc/ceph/ceph.conf' - ceph_conf = misc.get_file(ceph_node, ceph_conf_path, sudo=True) - misc.sudo_write_file(devstack_node, ceph_conf_path, ceph_conf) - - -def generate_ceph_keys(ceph_node): - log.info("Generating Ceph keys...") - - ceph_auth_cmds = [ - ['ceph', 'auth', 'get-or-create', 'client.cinder', 'mon', - 'allow r', 'osd', 'allow class-read object_prefix rbd_children, allow rwx pool=volumes, allow rx pool=images'], # noqa - ['ceph', 'auth', 'get-or-create', 'client.glance', 'mon', - 'allow r', 'osd', 'allow class-read object_prefix rbd_children, allow rwx pool=images'], # noqa - ['ceph', 'auth', 'get-or-create', 'client.cinder-backup', 'mon', - 'allow r', 'osd', 'allow class-read object_prefix rbd_children, allow rwx pool=backups'], # noqa - ] - for cmd in ceph_auth_cmds: - ceph_node.run(args=cmd) - - -def distribute_ceph_keys(devstack_node, ceph_node): - log.info("Copying Ceph keys to DevStack node...") - - def copy_key(from_remote, key_name, to_remote, dest_path, owner): - key_stringio = StringIO() - from_remote.run( - args=['ceph', 'auth', 'get-or-create', key_name], - stdout=key_stringio) - key_stringio.seek(0) - misc.sudo_write_file(to_remote, dest_path, - key_stringio, owner=owner) - 
keys = [ - dict(name='client.glance', - path='/etc/ceph/ceph.client.glance.keyring', - # devstack appears to just want root:root - #owner='glance:glance', - ), - dict(name='client.cinder', - path='/etc/ceph/ceph.client.cinder.keyring', - # devstack appears to just want root:root - #owner='cinder:cinder', - ), - dict(name='client.cinder-backup', - path='/etc/ceph/ceph.client.cinder-backup.keyring', - # devstack appears to just want root:root - #owner='cinder:cinder', - ), - ] - for key_dict in keys: - copy_key(ceph_node, key_dict['name'], devstack_node, - key_dict['path'], key_dict.get('owner')) - - -def set_libvirt_secret(devstack_node, ceph_node): - log.info("Setting libvirt secret...") - - cinder_key_stringio = StringIO() - ceph_node.run(args=['ceph', 'auth', 'get-key', 'client.cinder'], - stdout=cinder_key_stringio) - cinder_key = cinder_key_stringio.getvalue().strip() - - uuid_stringio = StringIO() - devstack_node.run(args=['uuidgen'], stdout=uuid_stringio) - uuid = uuid_stringio.getvalue().strip() - - secret_path = '/tmp/secret.xml' - secret_template = textwrap.dedent(""" - - {uuid} - - client.cinder secret - - """) - misc.sudo_write_file(devstack_node, secret_path, - secret_template.format(uuid=uuid)) - devstack_node.run(args=['sudo', 'virsh', 'secret-define', '--file', - secret_path]) - devstack_node.run(args=['sudo', 'virsh', 'secret-set-value', '--secret', - uuid, '--base64', cinder_key]) - return uuid - - -def update_devstack_config_files(devstack_node, secret_uuid): - log.info("Updating DevStack config files to use Ceph...") - - def backup_config(node, file_name, backup_ext='.orig.teuth'): - node.run(args=['cp', '-f', file_name, file_name + backup_ext]) - - def update_config(config_name, config_stream, update_dict, - section='DEFAULT'): - parser = ConfigParser() - parser.read_file(config_stream) - for (key, value) in update_dict.items(): - parser.set(section, key, value) - out_stream = StringIO() - parser.write(out_stream) - out_stream.seek(0) - return 
out_stream - - updates = [ - dict(name='/etc/glance/glance-api.conf', options=dict( - default_store='rbd', - rbd_store_user='glance', - rbd_store_pool='images', - show_image_direct_url='True',)), - dict(name='/etc/cinder/cinder.conf', options=dict( - volume_driver='cinder.volume.drivers.rbd.RBDDriver', - rbd_pool='volumes', - rbd_ceph_conf='/etc/ceph/ceph.conf', - rbd_flatten_volume_from_snapshot='false', - rbd_max_clone_depth='5', - glance_api_version='2', - rbd_user='cinder', - rbd_secret_uuid=secret_uuid, - backup_driver='cinder.backup.drivers.ceph', - backup_ceph_conf='/etc/ceph/ceph.conf', - backup_ceph_user='cinder-backup', - backup_ceph_chunk_size='134217728', - backup_ceph_pool='backups', - backup_ceph_stripe_unit='0', - backup_ceph_stripe_count='0', - restore_discard_excess_bytes='true', - )), - dict(name='/etc/nova/nova.conf', options=dict( - libvirt_images_type='rbd', - libvirt_images_rbd_pool='volumes', - libvirt_images_rbd_ceph_conf='/etc/ceph/ceph.conf', - rbd_user='cinder', - rbd_secret_uuid=secret_uuid, - libvirt_inject_password='false', - libvirt_inject_key='false', - libvirt_inject_partition='-2', - )), - ] - - for update in updates: - file_name = update['name'] - options = update['options'] - config_str = misc.get_file(devstack_node, file_name, sudo=True) - config_stream = StringIO(config_str) - backup_config(devstack_node, file_name) - new_config_stream = update_config(file_name, config_stream, options) - misc.sudo_write_file(devstack_node, file_name, new_config_stream) - - -def set_apache_servername(node): - # Apache complains: "Could not reliably determine the server's fully - # qualified domain name, using 127.0.0.1 for ServerName" - # So, let's make sure it knows its name. 
- log.info("Setting Apache ServerName...") - - hostname = node.hostname - config_file = '/etc/apache2/conf.d/servername' - misc.sudo_write_file(node, config_file, - "ServerName {name}".format(name=hostname)) - - -def start_devstack(devstack_node): - log.info("Patching devstack start script...") - # This causes screen to start headless - otherwise rejoin-stack.sh fails - # because there is no terminal attached. - cmd = "cd devstack && sed -ie 's/screen -c/screen -dm -c/' rejoin-stack.sh" - devstack_node.run(args=cmd) - - log.info("Starting devstack...") - cmd = "cd devstack && ./rejoin-stack.sh" - devstack_node.run(args=cmd) - - # This was added because I was getting timeouts on Cinder requests - which - # were trying to access Keystone on port 5000. A more robust way to handle - # this would be to introduce a wait-loop on devstack_node that checks to - # see if a service is listening on port 5000. - log.info("Waiting 30s for devstack to start...") - time.sleep(30) - - -def restart_apache(node): - node.run(args=['sudo', '/etc/init.d/apache2', 'restart'], wait=True) - - -@contextlib.contextmanager -def exercise(ctx, config): - log.info("Running devstack exercises...") - - if config is None: - config = {} - if not isinstance(config, dict): - raise TypeError("config must be a dict") - - devstack_node = ctx.cluster.only(is_devstack_node).remotes.keys()[0] - - # TODO: save the log *and* preserve failures - #devstack_archive_dir = create_devstack_archive(ctx, devstack_node) - - try: - #cmd = "cd devstack && ./exercise.sh 2>&1 | tee {dir}/exercise.log".format( # noqa - # dir=devstack_archive_dir) - cmd = "cd devstack && ./exercise.sh" - devstack_node.run(args=cmd, wait=True) - yield - finally: - pass - - -def create_devstack_archive(ctx, devstack_node): - test_dir = misc.get_testdir(ctx) - devstack_archive_dir = "{test_dir}/archive/devstack".format( - test_dir=test_dir) - devstack_node.run(args="mkdir -p " + devstack_archive_dir) - return devstack_archive_dir - - 
-@contextlib.contextmanager -def smoke(ctx, config): - log.info("Running a basic smoketest...") - - devstack_node = ctx.cluster.only(is_devstack_node).remotes.keys()[0] - an_osd_node = ctx.cluster.only(is_osd_node).remotes.keys()[0] - - try: - create_volume(devstack_node, an_osd_node, 'smoke0', 1) - yield - finally: - pass - - -def create_volume(devstack_node, ceph_node, vol_name, size): - """ - :param size: The size of the volume, in GB - """ - size = str(size) - log.info("Creating a {size}GB volume named {name}...".format( - name=vol_name, - size=size)) - args = ['source', 'devstack/openrc', run.Raw('&&'), 'cinder', 'create', - '--display-name', vol_name, size] - out_stream = StringIO() - devstack_node.run(args=args, stdout=out_stream, wait=True) - vol_info = parse_os_table(out_stream.getvalue()) - log.debug("Volume info: %s", str(vol_info)) - - out_stream = StringIO() - try: - ceph_node.run(args="rbd --id cinder ls -l volumes", stdout=out_stream, - wait=True) - except run.CommandFailedError: - log.debug("Original rbd call failed; retrying without '--id cinder'") - ceph_node.run(args="rbd ls -l volumes", stdout=out_stream, - wait=True) - - assert vol_info['id'] in out_stream.getvalue(), \ - "Volume not found on Ceph cluster" - assert vol_info['size'] == size, \ - "Volume size on Ceph cluster is different than specified" - return vol_info['id'] - - -def parse_os_table(table_str): - out_dict = dict() - for line in table_str.split('\n'): - if line.startswith('|'): - items = line.split() - out_dict[items[1]] = items[3] - return out_dict diff --git a/teuthology/task/die_on_err.py b/teuthology/task/die_on_err.py deleted file mode 100644 index 1dfd370736..0000000000 --- a/teuthology/task/die_on_err.py +++ /dev/null @@ -1,70 +0,0 @@ -""" -Raise exceptions on osd coredumps or test err directories -""" -import contextlib -import logging -import time -from ..orchestra import run - -import ceph_manager -from teuthology import misc as teuthology - -log = 
logging.getLogger(__name__) - -@contextlib.contextmanager -def task(ctx, config): - """ - Die if {testdir}/err exists or if an OSD dumps core - """ - if config is None: - config = {} - - first_mon = teuthology.get_first_mon(ctx, config) - (mon,) = ctx.cluster.only(first_mon).remotes.iterkeys() - - num_osds = teuthology.num_instances_of_type(ctx.cluster, 'osd') - log.info('num_osds is %s' % num_osds) - - manager = ceph_manager.CephManager( - mon, - ctx=ctx, - logger=log.getChild('ceph_manager'), - ) - - while len(manager.get_osd_status()['up']) < num_osds: - time.sleep(10) - - testdir = teuthology.get_testdir(ctx) - - while True: - for i in range(num_osds): - (osd_remote,) = ctx.cluster.only('osd.%d' % i).remotes.iterkeys() - p = osd_remote.run( - args = [ 'test', '-e', '{tdir}/err'.format(tdir=testdir) ], - wait=True, - check_status=False, - ) - exit_status = p.exitstatus - - if exit_status == 0: - log.info("osd %d has an error" % i) - raise Exception("osd %d error" % i) - - log_path = '/var/log/ceph/osd.%d.log' % (i) - - p = osd_remote.run( - args = [ - 'tail', '-1', log_path, - run.Raw('|'), - 'grep', '-q', 'end dump' - ], - wait=True, - check_status=False, - ) - exit_status = p.exitstatus - - if exit_status == 0: - log.info("osd %d dumped core" % i) - raise Exception("osd %d dumped core" % i) - - time.sleep(5) diff --git a/teuthology/task/divergent_priors.py b/teuthology/task/divergent_priors.py deleted file mode 100644 index 432614f233..0000000000 --- a/teuthology/task/divergent_priors.py +++ /dev/null @@ -1,148 +0,0 @@ -""" -Special case divergence test -""" -import logging -import time - -import ceph_manager -from teuthology import misc as teuthology -from teuthology.task_util.rados import rados - - -log = logging.getLogger(__name__) - -def task(ctx, config): - """ - Test handling of divergent entries with prior_version - prior to log_tail - - config: none - - Requires 3 osds. 
- """ - if config is None: - config = {} - assert isinstance(config, dict), \ - 'divergent_priors task only accepts a dict for configuration' - first_mon = teuthology.get_first_mon(ctx, config) - (mon,) = ctx.cluster.only(first_mon).remotes.iterkeys() - - manager = ceph_manager.CephManager( - mon, - ctx=ctx, - logger=log.getChild('ceph_manager'), - ) - ctx.manager = manager - - while len(manager.get_osd_status()['up']) < 3: - time.sleep(10) - manager.raw_cluster_cmd('tell', 'osd.0', 'flush_pg_stats') - manager.raw_cluster_cmd('tell', 'osd.1', 'flush_pg_stats') - manager.raw_cluster_cmd('tell', 'osd.2', 'flush_pg_stats') - manager.raw_cluster_cmd('osd', 'set', 'noout') - manager.raw_cluster_cmd('osd', 'set', 'noin') - manager.raw_cluster_cmd('osd', 'set', 'nodown') - manager.wait_for_clean() - - # something that is always there - dummyfile = '/etc/fstab' - dummyfile2 = '/etc/resolv.conf' - - # create 1 pg pool - log.info('creating foo') - manager.raw_cluster_cmd('osd', 'pool', 'create', 'foo', '1') - - osds = [0, 1, 2] - for i in osds: - manager.set_config(i, osd_min_pg_log_entries=1) - - # determine primary - divergent = manager.get_pg_primary('foo', 0) - log.info("primary and soon to be divergent is %d", divergent) - non_divergent = [0,1,2] - non_divergent.remove(divergent) - - log.info('writing initial objects') - # write 1000 objects - for i in range(1000): - rados(ctx, mon, ['-p', 'foo', 'put', 'existing_%d' % i, dummyfile]) - - manager.wait_for_clean() - - # blackhole non_divergent - log.info("blackholing osds %s", str(non_divergent)) - for i in non_divergent: - manager.set_config(i, filestore_blackhole='') - - # write 1 (divergent) object - log.info('writing divergent object existing_0') - rados( - ctx, mon, ['-p', 'foo', 'put', 'existing_0', dummyfile2], - wait=False) - time.sleep(10) - mon.run( - args=['killall', '-9', 'rados'], - wait=True, - check_status=False) - - # kill all the osds - log.info('killing all the osds') - for i in osds: - 
manager.kill_osd(i) - for i in osds: - manager.mark_down_osd(i) - for i in osds: - manager.mark_out_osd(i) - - # bring up non-divergent - log.info("bringing up non_divergent %s", str(non_divergent)) - for i in non_divergent: - manager.revive_osd(i) - for i in non_divergent: - manager.mark_in_osd(i) - - log.info('making log long to prevent backfill') - for i in non_divergent: - manager.set_config(i, osd_min_pg_log_entries=100000) - - # write 1 non-divergent object (ensure that old divergent one is divergent) - log.info('writing non-divergent object existing_1') - rados(ctx, mon, ['-p', 'foo', 'put', 'existing_1', dummyfile2]) - - manager.wait_for_recovery() - - # ensure no recovery - log.info('delay recovery') - for i in non_divergent: - manager.set_config(i, osd_recovery_delay_start=100000) - - # bring in our divergent friend - log.info("revive divergent %d", divergent) - manager.revive_osd(divergent) - - while len(manager.get_osd_status()['up']) < 3: - time.sleep(10) - - log.info('delay recovery divergent') - manager.set_config(divergent, osd_recovery_delay_start=100000) - log.info('mark divergent in') - manager.mark_in_osd(divergent) - - log.info('wait for peering') - rados(ctx, mon, ['-p', 'foo', 'put', 'foo', dummyfile]) - - log.info("killing divergent %d", divergent) - manager.kill_osd(divergent) - log.info("reviving divergent %d", divergent) - manager.revive_osd(divergent) - - log.info('allowing recovery') - for i in non_divergent: - manager.set_config(i, osd_recovery_delay_start=0) - - log.info('reading existing_0') - exit_status = rados(ctx, mon, - ['-p', 'foo', 'get', 'existing_0', - '-o', '/tmp/existing']) - assert exit_status is 0 - log.info("success") diff --git a/teuthology/task/dump_stuck.py b/teuthology/task/dump_stuck.py deleted file mode 100644 index 9e1780f015..0000000000 --- a/teuthology/task/dump_stuck.py +++ /dev/null @@ -1,146 +0,0 @@ -""" -Dump_stuck command -""" -import logging -import re -import time - -import ceph_manager -from teuthology 
import misc as teuthology - - -log = logging.getLogger(__name__) - -def check_stuck(manager, num_inactive, num_unclean, num_stale, timeout=10): - """ - Do checks. Make sure get_stuck_pgs return the right amout of information, then - extract health information from the raw_cluster_cmd and compare the results with - values passed in. This passes if all asserts pass. - - :param num_manager: Ceph manager - :param num_inactive: number of inaactive pages that are stuck - :param num_unclean: number of unclean pages that are stuck - :paran num_stale: number of stale pages that are stuck - :param timeout: timeout value for get_stuck_pgs calls - """ - inactive = manager.get_stuck_pgs('inactive', timeout) - assert len(inactive) == num_inactive - unclean = manager.get_stuck_pgs('unclean', timeout) - assert len(unclean) == num_unclean - stale = manager.get_stuck_pgs('stale', timeout) - assert len(stale) == num_stale - - # check health output as well - health = manager.raw_cluster_cmd('health') - log.debug('ceph health is: %s', health) - if num_inactive > 0: - m = re.search('(\d+) pgs stuck inactive', health) - assert int(m.group(1)) == num_inactive - if num_unclean > 0: - m = re.search('(\d+) pgs stuck unclean', health) - assert int(m.group(1)) == num_unclean - if num_stale > 0: - m = re.search('(\d+) pgs stuck stale', health) - assert int(m.group(1)) == num_stale - -def task(ctx, config): - """ - Test the dump_stuck command. 
- - :param ctx: Context - :param config: Configuration - """ - assert config is None, \ - 'dump_stuck requires no configuration' - assert teuthology.num_instances_of_type(ctx.cluster, 'osd') == 2, \ - 'dump_stuck requires exactly 2 osds' - - timeout = 60 - first_mon = teuthology.get_first_mon(ctx, config) - (mon,) = ctx.cluster.only(first_mon).remotes.iterkeys() - - manager = ceph_manager.CephManager( - mon, - ctx=ctx, - logger=log.getChild('ceph_manager'), - ) - - manager.raw_cluster_cmd('tell', 'osd.0', 'flush_pg_stats') - manager.raw_cluster_cmd('tell', 'osd.1', 'flush_pg_stats') - manager.wait_for_clean(timeout) - - manager.raw_cluster_cmd('tell', 'mon.0', 'injectargs', '--', -# '--mon-osd-report-timeout 90', - '--mon-pg-stuck-threshold 10') - - check_stuck( - manager, - num_inactive=0, - num_unclean=0, - num_stale=0, - ) - num_pgs = manager.get_num_pgs() - - manager.mark_out_osd(0) - time.sleep(timeout) - manager.raw_cluster_cmd('tell', 'osd.1', 'flush_pg_stats') - manager.wait_for_recovery(timeout) - - check_stuck( - manager, - num_inactive=0, - num_unclean=num_pgs, - num_stale=0, - ) - - manager.mark_in_osd(0) - manager.raw_cluster_cmd('tell', 'osd.0', 'flush_pg_stats') - manager.raw_cluster_cmd('tell', 'osd.1', 'flush_pg_stats') - manager.wait_for_clean(timeout) - - check_stuck( - manager, - num_inactive=0, - num_unclean=0, - num_stale=0, - ) - - for id_ in teuthology.all_roles_of_type(ctx.cluster, 'osd'): - manager.kill_osd(id_) - manager.mark_down_osd(id_) - - starttime = time.time() - done = False - while not done: - try: - check_stuck( - manager, - num_inactive=0, - num_unclean=0, - num_stale=num_pgs, - ) - done = True - except AssertionError: - # wait up to 15 minutes to become stale - if time.time() - starttime > 900: - raise - - for id_ in teuthology.all_roles_of_type(ctx.cluster, 'osd'): - manager.revive_osd(id_) - manager.mark_in_osd(id_) - while True: - try: - manager.raw_cluster_cmd('tell', 'osd.0', 'flush_pg_stats') - 
manager.raw_cluster_cmd('tell', 'osd.1', 'flush_pg_stats') - break - except Exception: - log.exception('osds must not be started yet, waiting...') - time.sleep(1) - manager.wait_for_clean(timeout) - - check_stuck( - manager, - num_inactive=0, - num_unclean=0, - num_stale=0, - ) diff --git a/teuthology/task/ec_lost_unfound.py b/teuthology/task/ec_lost_unfound.py deleted file mode 100644 index 6c155abd10..0000000000 --- a/teuthology/task/ec_lost_unfound.py +++ /dev/null @@ -1,137 +0,0 @@ -""" -Lost_unfound -""" -import logging -import ceph_manager -from teuthology import misc as teuthology -from teuthology.task_util.rados import rados - -log = logging.getLogger(__name__) - -def task(ctx, config): - """ - Test handling of lost objects on an ec pool. - - A pretty rigid cluster is brought up andtested by this task - """ - if config is None: - config = {} - assert isinstance(config, dict), \ - 'lost_unfound task only accepts a dict for configuration' - first_mon = teuthology.get_first_mon(ctx, config) - (mon,) = ctx.cluster.only(first_mon).remotes.iterkeys() - - manager = ceph_manager.CephManager( - mon, - ctx=ctx, - logger=log.getChild('ceph_manager'), - ) - - manager.raw_cluster_cmd('tell', 'osd.0', 'flush_pg_stats') - manager.raw_cluster_cmd('tell', 'osd.1', 'flush_pg_stats') - manager.raw_cluster_cmd('tell', 'osd.2', 'flush_pg_stats') - manager.raw_cluster_cmd('tell', 'osd.3', 'flush_pg_stats') - manager.wait_for_clean() - - profile = config.get('erasure_code_profile', { - 'k': '2', - 'm': '2', - 'ruleset-failure-domain': 'osd' - }) - profile_name = profile.get('name', 'lost_unfound') - manager.create_erasure_code_profile(profile_name, profile) - pool = manager.create_pool_with_unique_name(erasure_code_profile_name=profile_name) - - # something that is always there - dummyfile = '/etc/fstab' - - # kludge to make sure they get a map - rados(ctx, mon, ['-p', pool, 'put', 'dummy', dummyfile]) - - manager.raw_cluster_cmd('tell', 'osd.0', 'flush_pg_stats') - 
manager.raw_cluster_cmd('tell', 'osd.1', 'flush_pg_stats') - manager.wait_for_recovery() - - # create old objects - for f in range(1, 10): - rados(ctx, mon, ['-p', pool, 'put', 'existing_%d' % f, dummyfile]) - rados(ctx, mon, ['-p', pool, 'put', 'existed_%d' % f, dummyfile]) - rados(ctx, mon, ['-p', pool, 'rm', 'existed_%d' % f]) - - # delay recovery, and make the pg log very long (to prevent backfill) - manager.raw_cluster_cmd( - 'tell', 'osd.1', - 'injectargs', - '--osd-recovery-delay-start 1000 --osd-min-pg-log-entries 100000000' - ) - - manager.kill_osd(0) - manager.mark_down_osd(0) - manager.kill_osd(3) - manager.mark_down_osd(3) - - for f in range(1, 10): - rados(ctx, mon, ['-p', pool, 'put', 'new_%d' % f, dummyfile]) - rados(ctx, mon, ['-p', pool, 'put', 'existed_%d' % f, dummyfile]) - rados(ctx, mon, ['-p', pool, 'put', 'existing_%d' % f, dummyfile]) - - # take out osd.1 and a necessary shard of those objects. - manager.kill_osd(1) - manager.mark_down_osd(1) - manager.raw_cluster_cmd('osd', 'lost', '1', '--yes-i-really-mean-it') - manager.revive_osd(0) - manager.wait_till_osd_is_up(0) - manager.revive_osd(3) - manager.wait_till_osd_is_up(3) - - manager.raw_cluster_cmd('tell', 'osd.0', 'flush_pg_stats') - manager.raw_cluster_cmd('tell', 'osd.2', 'flush_pg_stats') - manager.raw_cluster_cmd('tell', 'osd.3', 'flush_pg_stats') - manager.wait_till_active() - manager.raw_cluster_cmd('tell', 'osd.0', 'flush_pg_stats') - manager.raw_cluster_cmd('tell', 'osd.2', 'flush_pg_stats') - manager.raw_cluster_cmd('tell', 'osd.3', 'flush_pg_stats') - - # verify that there are unfound objects - unfound = manager.get_num_unfound_objects() - log.info("there are %d unfound objects" % unfound) - assert unfound - - # mark stuff lost - pgs = manager.get_pg_stats() - for pg in pgs: - if pg['stat_sum']['num_objects_unfound'] > 0: - # verify that i can list them direct from the osd - log.info('listing missing/lost in %s state %s', pg['pgid'], - pg['state']); - m = 
manager.list_pg_missing(pg['pgid']) - log.info('%s' % m) - assert m['num_unfound'] == pg['stat_sum']['num_objects_unfound'] - - log.info("reverting unfound in %s", pg['pgid']) - manager.raw_cluster_cmd('pg', pg['pgid'], - 'mark_unfound_lost', 'delete') - else: - log.info("no unfound in %s", pg['pgid']) - - manager.raw_cluster_cmd('tell', 'osd.0', 'debug', 'kick_recovery_wq', '5') - manager.raw_cluster_cmd('tell', 'osd.2', 'debug', 'kick_recovery_wq', '5') - manager.raw_cluster_cmd('tell', 'osd.3', 'debug', 'kick_recovery_wq', '5') - manager.raw_cluster_cmd('tell', 'osd.0', 'flush_pg_stats') - manager.raw_cluster_cmd('tell', 'osd.2', 'flush_pg_stats') - manager.raw_cluster_cmd('tell', 'osd.3', 'flush_pg_stats') - manager.wait_for_recovery() - - # verify result - for f in range(1, 10): - err = rados(ctx, mon, ['-p', pool, 'get', 'new_%d' % f, '-']) - assert err - err = rados(ctx, mon, ['-p', pool, 'get', 'existed_%d' % f, '-']) - assert err - err = rados(ctx, mon, ['-p', pool, 'get', 'existing_%d' % f, '-']) - assert err - - # see if osd.1 can cope - manager.revive_osd(1) - manager.wait_till_osd_is_up(1) - manager.wait_for_clean() diff --git a/teuthology/task/filestore_idempotent.py b/teuthology/task/filestore_idempotent.py deleted file mode 100644 index 1689a3764b..0000000000 --- a/teuthology/task/filestore_idempotent.py +++ /dev/null @@ -1,81 +0,0 @@ -""" -Filestore/filejournal handler -""" -import logging -from ..orchestra import run -import random - -from teuthology import misc as teuthology - -log = logging.getLogger(__name__) - -def task(ctx, config): - """ - Test filestore/filejournal handling of non-idempotent events. - - Currently this is a kludge; we require the ceph task preceeds us just - so that we get the tarball installed to run the test binary. 
- - :param ctx: Context - :param config: Configuration - """ - assert config is None or isinstance(config, list) \ - or isinstance(config, dict), \ - "task only supports a list or dictionary for configuration" - all_clients = ['client.{id}'.format(id=id_) - for id_ in teuthology.all_roles_of_type(ctx.cluster, 'client')] - if config is None: - config = all_clients - if isinstance(config, list): - config = dict.fromkeys(config) - clients = config.keys() - - # just use the first client... - client = clients[0]; - (remote,) = ctx.cluster.only(client).remotes.iterkeys() - - testdir = teuthology.get_testdir(ctx) - - dir = '%s/data/test.%s' % (testdir, client) - - seed = str(int(random.uniform(1,100))) - - try: - log.info('creating a working dir') - remote.run(args=['mkdir', dir]) - remote.run( - args=[ - 'cd', dir, - run.Raw('&&'), - 'wget','-q', '-Orun_seed_to.sh', - 'http://ceph.com/git/?p=ceph.git;a=blob_plain;f=src/test/objectstore/run_seed_to.sh;hb=HEAD', - run.Raw('&&'), - 'wget','-q', '-Orun_seed_to_range.sh', - 'http://ceph.com/git/?p=ceph.git;a=blob_plain;f=src/test/objectstore/run_seed_to_range.sh;hb=HEAD', - run.Raw('&&'), - 'chmod', '+x', 'run_seed_to.sh', 'run_seed_to_range.sh', - ]); - - log.info('running a series of tests') - proc = remote.run( - args=[ - 'cd', dir, - run.Raw('&&'), - './run_seed_to_range.sh', seed, '50', '300', - ], - wait=False, - check_status=False) - result = proc.wait() - - if result != 0: - remote.run( - args=[ - 'cp', '-a', dir, '{tdir}/archive/idempotent_failure'.format(tdir=testdir), - ]) - raise Exception("./run_seed_to_range.sh errored out") - - finally: - remote.run(args=[ - 'rm', '-rf', '--', dir - ]) - diff --git a/teuthology/task/kclient.py b/teuthology/task/kclient.py deleted file mode 100644 index 96f421f9ab..0000000000 --- a/teuthology/task/kclient.py +++ /dev/null @@ -1,71 +0,0 @@ -""" -Mount/unmount a ``kernel`` client. 
-""" -import contextlib -import logging - -from teuthology import misc -from teuthology.task.cephfs.kernel_mount import KernelMount - -log = logging.getLogger(__name__) - -@contextlib.contextmanager -def task(ctx, config): - """ - Mount/unmount a ``kernel`` client. - - The config is optional and defaults to mounting on all clients. If - a config is given, it is expected to be a list of clients to do - this operation on. This lets you e.g. set up one client with - ``ceph-fuse`` and another with ``kclient``. - - Example that mounts all clients:: - - tasks: - - ceph: - - kclient: - - interactive: - - Example that uses both ``kclient` and ``ceph-fuse``:: - - tasks: - - ceph: - - ceph-fuse: [client.0] - - kclient: [client.1] - - interactive: - - :param ctx: Context - :param config: Configuration - """ - log.info('Mounting kernel clients...') - assert config is None or isinstance(config, list), \ - "task kclient got invalid config" - - if config is None: - config = ['client.{id}'.format(id=id_) - for id_ in misc.all_roles_of_type(ctx.cluster, 'client')] - clients = list(misc.get_clients(ctx=ctx, roles=config)) - - test_dir = misc.get_testdir(ctx) - - # Assemble mon addresses - remotes_and_roles = ctx.cluster.remotes.items() - roles = [roles for (remote_, roles) in remotes_and_roles] - ips = [remote_.ssh.get_transport().getpeername()[0] - for (remote_, _) in remotes_and_roles] - mons = misc.get_mons(roles, ips).values() - - mounts = {} - for id_, remote in clients: - kernel_mount = KernelMount(mons, test_dir, id_, remote) - mounts[id_] = kernel_mount - - kernel_mount.mount() - - ctx.mounts = mounts - try: - yield mounts - finally: - log.info('Unmounting kernel clients...') - for mount in mounts.values(): - mount.umount() diff --git a/teuthology/task/locktest.py b/teuthology/task/locktest.py deleted file mode 100755 index d896e18ffa..0000000000 --- a/teuthology/task/locktest.py +++ /dev/null @@ -1,134 +0,0 @@ -""" -locktests -""" -import logging - -from ..orchestra import 
run -from teuthology import misc as teuthology - -log = logging.getLogger(__name__) - -def task(ctx, config): - """ - Run locktests, from the xfstests suite, on the given - clients. Whether the clients are ceph-fuse or kernel does not - matter, and the two clients can refer to the same mount. - - The config is a list of two clients to run the locktest on. The - first client will be the host. - - For example: - tasks: - - ceph: - - ceph-fuse: [client.0, client.1] - - locktest: - [client.0, client.1] - - This task does not yield; there would be little point. - - :param ctx: Context - :param config: Configuration - """ - - assert isinstance(config, list) - log.info('fetching and building locktests...') - (host,) = ctx.cluster.only(config[0]).remotes - (client,) = ctx.cluster.only(config[1]).remotes - ( _, _, host_id) = config[0].partition('.') - ( _, _, client_id) = config[1].partition('.') - testdir = teuthology.get_testdir(ctx) - hostmnt = '{tdir}/mnt.{id}'.format(tdir=testdir, id=host_id) - clientmnt = '{tdir}/mnt.{id}'.format(tdir=testdir, id=client_id) - - try: - for client_name in config: - log.info('building on {client_}'.format(client_=client_name)) - ctx.cluster.only(client_name).run( - args=[ - # explicitly does not support multiple autotest tasks - # in a single run; the result archival would conflict - 'mkdir', '{tdir}/archive/locktest'.format(tdir=testdir), - run.Raw('&&'), - 'mkdir', '{tdir}/locktest'.format(tdir=testdir), - run.Raw('&&'), - 'wget', - '-nv', - 'https://raw.github.com/gregsfortytwo/xfstests-ceph/master/src/locktest.c', - '-O', '{tdir}/locktest/locktest.c'.format(tdir=testdir), - run.Raw('&&'), - 'g++', '{tdir}/locktest/locktest.c'.format(tdir=testdir), - '-o', '{tdir}/locktest/locktest'.format(tdir=testdir) - ], - logger=log.getChild('locktest_client.{id}'.format(id=client_name)), - ) - - log.info('built locktest on each client') - - host.run(args=['sudo', 'touch', - '{mnt}/locktestfile'.format(mnt=hostmnt), - run.Raw('&&'), - 'sudo', 
'chown', 'ubuntu.ubuntu', - '{mnt}/locktestfile'.format(mnt=hostmnt) - ] - ) - - log.info('starting on host') - hostproc = host.run( - args=[ - '{tdir}/locktest/locktest'.format(tdir=testdir), - '-p', '6788', - '-d', - '{mnt}/locktestfile'.format(mnt=hostmnt), - ], - wait=False, - logger=log.getChild('locktest.host'), - ) - log.info('starting on client') - (_,_,hostaddr) = host.name.partition('@') - clientproc = client.run( - args=[ - '{tdir}/locktest/locktest'.format(tdir=testdir), - '-p', '6788', - '-d', - '-h', hostaddr, - '{mnt}/locktestfile'.format(mnt=clientmnt), - ], - logger=log.getChild('locktest.client'), - wait=False - ) - - hostresult = hostproc.wait() - clientresult = clientproc.wait() - if (hostresult != 0) or (clientresult != 0): - raise Exception("Did not pass locking test!") - log.info('finished locktest executable with results {r} and {s}'. \ - format(r=hostresult, s=clientresult)) - - finally: - log.info('cleaning up host dir') - host.run( - args=[ - 'mkdir', '-p', '{tdir}/locktest'.format(tdir=testdir), - run.Raw('&&'), - 'rm', '-f', '{tdir}/locktest/locktest.c'.format(tdir=testdir), - run.Raw('&&'), - 'rm', '-f', '{tdir}/locktest/locktest'.format(tdir=testdir), - run.Raw('&&'), - 'rmdir', '{tdir}/locktest' - ], - logger=log.getChild('.{id}'.format(id=config[0])), - ) - log.info('cleaning up client dir') - client.run( - args=[ - 'mkdir', '-p', '{tdir}/locktest'.format(tdir=testdir), - run.Raw('&&'), - 'rm', '-f', '{tdir}/locktest/locktest.c'.format(tdir=testdir), - run.Raw('&&'), - 'rm', '-f', '{tdir}/locktest/locktest'.format(tdir=testdir), - run.Raw('&&'), - 'rmdir', '{tdir}/locktest'.format(tdir=testdir) - ], - logger=log.getChild('.{id}'.format(\ - id=config[1])), - ) diff --git a/teuthology/task/lost_unfound.py b/teuthology/task/lost_unfound.py deleted file mode 100644 index 674fe123ce..0000000000 --- a/teuthology/task/lost_unfound.py +++ /dev/null @@ -1,156 +0,0 @@ -""" -Lost_unfound -""" -import logging -import ceph_manager -from 
teuthology import misc as teuthology -from teuthology.task_util.rados import rados - -log = logging.getLogger(__name__) - -def task(ctx, config): - """ - Test handling of lost objects. - - A pretty rigid cluseter is brought up andtested by this task - """ - POOL = 'unfound_pool' - if config is None: - config = {} - assert isinstance(config, dict), \ - 'lost_unfound task only accepts a dict for configuration' - first_mon = teuthology.get_first_mon(ctx, config) - (mon,) = ctx.cluster.only(first_mon).remotes.iterkeys() - - manager = ceph_manager.CephManager( - mon, - ctx=ctx, - logger=log.getChild('ceph_manager'), - ) - - while len(manager.get_osd_status()['up']) < 3: - manager.sleep(10) - manager.raw_cluster_cmd('tell', 'osd.0', 'flush_pg_stats') - manager.raw_cluster_cmd('tell', 'osd.1', 'flush_pg_stats') - manager.raw_cluster_cmd('tell', 'osd.2', 'flush_pg_stats') - manager.wait_for_clean() - - manager.create_pool(POOL) - - # something that is always there - dummyfile = '/etc/fstab' - - # take an osd out until the very end - manager.kill_osd(2) - manager.mark_down_osd(2) - manager.mark_out_osd(2) - - # kludge to make sure they get a map - rados(ctx, mon, ['-p', POOL, 'put', 'dummy', dummyfile]) - - manager.raw_cluster_cmd('tell', 'osd.0', 'flush_pg_stats') - manager.raw_cluster_cmd('tell', 'osd.1', 'flush_pg_stats') - manager.wait_for_recovery() - - # create old objects - for f in range(1, 10): - rados(ctx, mon, ['-p', POOL, 'put', 'existing_%d' % f, dummyfile]) - rados(ctx, mon, ['-p', POOL, 'put', 'existed_%d' % f, dummyfile]) - rados(ctx, mon, ['-p', POOL, 'rm', 'existed_%d' % f]) - - # delay recovery, and make the pg log very long (to prevent backfill) - manager.raw_cluster_cmd( - 'tell', 'osd.1', - 'injectargs', - '--osd-recovery-delay-start 1000 --osd-min-pg-log-entries 100000000' - ) - - manager.kill_osd(0) - manager.mark_down_osd(0) - - for f in range(1, 10): - rados(ctx, mon, ['-p', POOL, 'put', 'new_%d' % f, dummyfile]) - rados(ctx, mon, ['-p', POOL, 
'put', 'existed_%d' % f, dummyfile]) - rados(ctx, mon, ['-p', POOL, 'put', 'existing_%d' % f, dummyfile]) - - # bring osd.0 back up, let it peer, but don't replicate the new - # objects... - log.info('osd.0 command_args is %s' % 'foo') - log.info(ctx.daemons.get_daemon('osd', 0).command_args) - ctx.daemons.get_daemon('osd', 0).command_kwargs['args'].extend([ - '--osd-recovery-delay-start', '1000' - ]) - manager.revive_osd(0) - manager.mark_in_osd(0) - manager.wait_till_osd_is_up(0) - - manager.raw_cluster_cmd('tell', 'osd.1', 'flush_pg_stats') - manager.raw_cluster_cmd('tell', 'osd.0', 'flush_pg_stats') - manager.wait_till_active() - - # take out osd.1 and the only copy of those objects. - manager.kill_osd(1) - manager.mark_down_osd(1) - manager.mark_out_osd(1) - manager.raw_cluster_cmd('osd', 'lost', '1', '--yes-i-really-mean-it') - - # bring up osd.2 so that things would otherwise, in theory, recovery fully - manager.revive_osd(2) - manager.mark_in_osd(2) - manager.wait_till_osd_is_up(2) - - manager.raw_cluster_cmd('tell', 'osd.0', 'flush_pg_stats') - manager.raw_cluster_cmd('tell', 'osd.2', 'flush_pg_stats') - manager.wait_till_active() - manager.raw_cluster_cmd('tell', 'osd.0', 'flush_pg_stats') - manager.raw_cluster_cmd('tell', 'osd.2', 'flush_pg_stats') - - # verify that there are unfound objects - unfound = manager.get_num_unfound_objects() - log.info("there are %d unfound objects" % unfound) - assert unfound - - # mark stuff lost - pgs = manager.get_pg_stats() - for pg in pgs: - if pg['stat_sum']['num_objects_unfound'] > 0: - primary = 'osd.%d' % pg['acting'][0] - - # verify that i can list them direct from the osd - log.info('listing missing/lost in %s state %s', pg['pgid'], - pg['state']); - m = manager.list_pg_missing(pg['pgid']) - #log.info('%s' % m) - assert m['num_unfound'] == pg['stat_sum']['num_objects_unfound'] - num_unfound=0 - for o in m['objects']: - if len(o['locations']) == 0: - num_unfound += 1 - assert m['num_unfound'] == num_unfound - - 
log.info("reverting unfound in %s on %s", pg['pgid'], primary) - manager.raw_cluster_cmd('pg', pg['pgid'], - 'mark_unfound_lost', 'revert') - else: - log.info("no unfound in %s", pg['pgid']) - - manager.raw_cluster_cmd('tell', 'osd.0', 'debug', 'kick_recovery_wq', '5') - manager.raw_cluster_cmd('tell', 'osd.2', 'debug', 'kick_recovery_wq', '5') - manager.raw_cluster_cmd('tell', 'osd.0', 'flush_pg_stats') - manager.raw_cluster_cmd('tell', 'osd.2', 'flush_pg_stats') - manager.wait_for_recovery() - - # verify result - for f in range(1, 10): - err = rados(ctx, mon, ['-p', POOL, 'get', 'new_%d' % f, '-']) - assert err - err = rados(ctx, mon, ['-p', POOL, 'get', 'existed_%d' % f, '-']) - assert err - err = rados(ctx, mon, ['-p', POOL, 'get', 'existing_%d' % f, '-']) - assert not err - - # see if osd.1 can cope - manager.revive_osd(1) - manager.mark_in_osd(1) - manager.wait_till_osd_is_up(1) - manager.wait_for_clean() diff --git a/teuthology/task/manypools.py b/teuthology/task/manypools.py deleted file mode 100644 index 32b9d562bf..0000000000 --- a/teuthology/task/manypools.py +++ /dev/null @@ -1,73 +0,0 @@ -""" -Force pg creation on all osds -""" -from teuthology import misc as teuthology -from ..orchestra import run -import logging - -log = logging.getLogger(__name__) - -def task(ctx, config): - """ - Create the specified number of pools and write 16 objects to them (thereby forcing - the PG creation on each OSD). This task creates pools from all the clients, - in parallel. It is easy to add other daemon types which have the appropriate - permissions, but I don't think anything else does. - The config is just the number of pools to create. I recommend setting - "mon create pg interval" to a very low value in your ceph config to speed - this up. - - You probably want to do this to look at memory consumption, and - maybe to test how performance changes with the number of PGs. 
For example: - - tasks: - - ceph: - config: - mon: - mon create pg interval: 1 - - manypools: 3000 - - radosbench: - clients: [client.0] - time: 360 - """ - - log.info('creating {n} pools'.format(n=config)) - - poolnum = int(config) - creator_remotes = [] - client_roles = teuthology.all_roles_of_type(ctx.cluster, 'client') - log.info('got client_roles={client_roles_}'.format(client_roles_=client_roles)) - for role in client_roles: - log.info('role={role_}'.format(role_=role)) - (creator_remote, ) = ctx.cluster.only('client.{id}'.format(id=role)).remotes.iterkeys() - creator_remotes.append((creator_remote, 'client.{id}'.format(id=role))) - - remaining_pools = poolnum - poolprocs=dict() - while (remaining_pools > 0): - log.info('{n} pools remaining to create'.format(n=remaining_pools)) - for remote, role_ in creator_remotes: - poolnum = remaining_pools - remaining_pools -= 1 - if remaining_pools < 0: - continue - log.info('creating pool{num} on {role}'.format(num=poolnum, role=role_)) - proc = remote.run( - args=[ - 'rados', - '--name', role_, - 'mkpool', 'pool{num}'.format(num=poolnum), '-1', - run.Raw('&&'), - 'rados', - '--name', role_, - '--pool', 'pool{num}'.format(num=poolnum), - 'bench', '0', 'write', '-t', '16', '--block-size', '1' - ], - wait = False - ) - log.info('waiting for pool and object creates') - poolprocs[remote] = proc - - run.wait(poolprocs.itervalues()) - - log.info('created all {n} pools and wrote 16 objects to each'.format(n=poolnum)) diff --git a/teuthology/task/mds_client_recovery.py b/teuthology/task/mds_client_recovery.py deleted file mode 100644 index 903a70a5b0..0000000000 --- a/teuthology/task/mds_client_recovery.py +++ /dev/null @@ -1,352 +0,0 @@ - -""" -Teuthology task for exercising CephFS client recovery -""" - -import contextlib -import logging -import time -import unittest - -from teuthology import misc -from teuthology.orchestra.run import CommandFailedError -from teuthology.task import interactive -from 
teuthology.task.cephfs.filesystem import Filesystem -from teuthology.task.ceph_fuse import get_client_configs, FuseMount - - -log = logging.getLogger(__name__) - - -# Arbitrary timeouts for operations involving restarting -# an MDS or waiting for it to come up -MDS_RESTART_GRACE = 60 - - -class TestClientRecovery(unittest.TestCase): - # Environment references - fs = None - mount_a = None - mount_b = None - mds_session_timeout = None - mds_reconnect_timeout = None - - def setUp(self): - self.fs.mds_restart() - self.mount_a.mount() - self.mount_b.mount() - self.mount_a.wait_until_mounted() - self.mount_a.wait_until_mounted() - - def tearDown(self): - self.mount_a.teardown() - self.mount_b.teardown() - # mount_a.umount() - # mount_b.umount() - # run.wait([mount_a.fuse_daemon, mount_b.fuse_daemon], timeout=600) - # mount_a.cleanup() - # mount_b.cleanup() - - def test_basic(self): - # Check that two clients come up healthy and see each others' files - # ===================================================== - self.mount_a.create_files() - self.mount_a.check_files() - self.mount_a.umount_wait() - - self.mount_b.check_files() - - self.mount_a.mount() - self.mount_a.wait_until_mounted() - - # Check that the admin socket interface is correctly reporting - # two sessions - # ===================================================== - ls_data = self._session_list() - self.assert_session_count(2, ls_data) - - self.assertSetEqual( - set([l['id'] for l in ls_data]), - {self.mount_a.get_client_id(), self.mount_b.get_client_id()} - ) - - def test_restart(self): - # Check that after an MDS restart both clients reconnect and continue - # to handle I/O - # ===================================================== - self.fs.mds_stop() - self.fs.mds_fail() - self.fs.mds_restart() - self.fs.wait_for_state('up:active', timeout=MDS_RESTART_GRACE) - - self.mount_a.create_destroy() - self.mount_b.create_destroy() - - def assert_session_count(self, expected, ls_data=None): - if ls_data is None: - 
ls_data = self.fs.mds_asok(['session', 'ls']) - - self.assertEqual(expected, len(ls_data), "Expected {0} sessions, found {1}".format( - expected, len(ls_data) - )) - - def _session_list(self): - ls_data = self.fs.mds_asok(['session', 'ls']) - ls_data = [s for s in ls_data if s['state'] not in ['stale', 'closed']] - return ls_data - - def _session_by_id(self, session_ls): - return dict([(s['id'], s) for s in session_ls]) - - def test_reconnect_timeout(self): - # Reconnect timeout - # ================= - # Check that if I stop an MDS and a client goes away, the MDS waits - # for the reconnect period - self.fs.mds_stop() - self.fs.mds_fail() - - mount_a_client_id = self.mount_a.get_client_id() - self.mount_a.umount_wait(force=True) - - self.fs.mds_restart() - - self.fs.wait_for_state('up:reconnect', reject='up:active', timeout=MDS_RESTART_GRACE) - - ls_data = self._session_list() - self.assert_session_count(2, ls_data) - - # The session for the dead client should have the 'reconnect' flag set - self.assertTrue(self._session_by_id(ls_data)[mount_a_client_id]['reconnecting']) - - # Wait for the reconnect state to clear, this should take the - # reconnect timeout period. - in_reconnect_for = self.fs.wait_for_state('up:active', timeout=self.mds_reconnect_timeout * 2) - # Check that the period we waited to enter active is within a factor - # of two of the reconnect timeout. 
- self.assertGreater(in_reconnect_for, self.mds_reconnect_timeout / 2, - "Should have been in reconnect phase for {0} but only took {1}".format( - self.mds_reconnect_timeout, in_reconnect_for - )) - - self.assert_session_count(1) - - # Check that the client that timed out during reconnect can - # mount again and do I/O - self.mount_a.mount() - self.mount_a.wait_until_mounted() - self.mount_a.create_destroy() - - self.assert_session_count(2) - - def test_reconnect_eviction(self): - # Eviction during reconnect - # ========================= - self.fs.mds_stop() - self.fs.mds_fail() - - mount_a_client_id = self.mount_a.get_client_id() - self.mount_a.umount_wait(force=True) - - self.fs.mds_restart() - - # Enter reconnect phase - self.fs.wait_for_state('up:reconnect', reject='up:active', timeout=MDS_RESTART_GRACE) - self.assert_session_count(2) - - # Evict the stuck client - self.fs.mds_asok(['session', 'evict', "%s" % mount_a_client_id]) - self.assert_session_count(1) - - # Observe that we proceed to active phase without waiting full reconnect timeout - evict_til_active = self.fs.wait_for_state('up:active', timeout=MDS_RESTART_GRACE) - # Once we evict the troublemaker, the reconnect phase should complete - # in well under the reconnect timeout. - self.assertLess(evict_til_active, self.mds_reconnect_timeout * 0.5, - "reconnect did not complete soon enough after eviction, took {0}".format( - evict_til_active - )) - - # Bring the client back - self.mount_a.mount() - self.mount_a.create_destroy() - - def test_stale_caps(self): - # Capability release from stale session - # ===================================== - cap_holder = self.mount_a.open_background() - self.mount_a.kill() - - # Now, after mds_session_timeout seconds, the waiter should - # complete their operation when the MDS marks the holder's - # session stale. 
- cap_waiter = self.mount_b.write_background() - a = time.time() - cap_waiter.wait() - b = time.time() - cap_waited = b - a - log.info("cap_waiter waited {0}s".format(cap_waited)) - self.assertTrue(self.mds_session_timeout / 2.0 <= cap_waited <= self.mds_session_timeout * 2.0, - "Capability handover took {0}, expected approx {1}".format( - cap_waited, self.mds_session_timeout - )) - - cap_holder.stdin.close() - try: - cap_holder.wait() - except CommandFailedError: - # We killed it, so it raises an error - pass - - self.mount_a.kill_cleanup() - - self.mount_a.mount() - self.mount_a.wait_until_mounted() - - def test_evicted_caps(self): - # Eviction while holding a capability - # =================================== - - # Take out a write capability on a file on client A, - # and then immediately kill it. - cap_holder = self.mount_a.open_background() - mount_a_client_id = self.mount_a.get_client_id() - self.mount_a.kill() - - # The waiter should get stuck waiting for the capability - # held on the MDS by the now-dead client A - cap_waiter = self.mount_b.write_background() - time.sleep(5) - self.assertFalse(cap_waiter.finished) - - self.fs.mds_asok(['session', 'evict', "%s" % mount_a_client_id]) - # Now, because I evicted the old holder of the capability, it should - # immediately get handed over to the waiter - a = time.time() - cap_waiter.wait() - b = time.time() - cap_waited = b - a - log.info("cap_waiter waited {0}s".format(cap_waited)) - # This is the check that it happened 'now' rather than waiting - # for the session timeout - self.assertLess(cap_waited, self.mds_session_timeout / 2.0, - "Capability handover took {0}, expected less than {1}".format( - cap_waited, self.mds_session_timeout / 2.0 - )) - - cap_holder.stdin.close() - try: - cap_holder.wait() - except CommandFailedError: - # We killed it, so it raises an error - pass - - self.mount_a.kill_cleanup() - - self.mount_a.mount() - self.mount_a.wait_until_mounted() - - -class LogStream(object): - def 
__init__(self): - self.buffer = "" - - def write(self, data): - self.buffer += data - if "\n" in self.buffer: - lines = self.buffer.split("\n") - for line in lines[:-1]: - log.info(line) - self.buffer = lines[-1] - - def flush(self): - pass - - -class InteractiveFailureResult(unittest.TextTestResult): - """ - Specialization that implements interactive-on-error style - behavior. - """ - ctx = None - - def addFailure(self, test, err): - log.error(self._exc_info_to_string(err, test)) - log.error("Failure in test '{0}', going interactive".format( - self.getDescription(test) - )) - interactive.task(ctx=self.ctx, config=None) - - def addError(self, test, err): - log.error(self._exc_info_to_string(err, test)) - log.error("Error in test '{0}', going interactive".format( - self.getDescription(test) - )) - interactive.task(ctx=self.ctx, config=None) - - -@contextlib.contextmanager -def task(ctx, config): - fs = Filesystem(ctx, config) - - # Pick out the clients we will use from the configuration - # ======================================================= - client_list = list(misc.all_roles_of_type(ctx.cluster, 'client')) - if len(client_list) < 2: - raise RuntimeError("Need at least two clients") - - client_a_id = client_list[0] - client_a_role = "client.{0}".format(client_a_id) - client_a_remote = list(misc.get_clients(ctx=ctx, roles=["client.{0}".format(client_a_id)]))[0][1] - - client_b_id = client_list[1] - client_b_role = "client.{0}".format(client_b_id) - client_b_remote = list(misc.get_clients(ctx=ctx, roles=["client.{0}".format(client_a_id)]))[0][1] - - test_dir = misc.get_testdir(ctx) - - # TODO: enable switching FUSE to kclient here - # or perhaps just use external client tasks and consume ctx.mounts here? 
- client_configs = get_client_configs(ctx, config) - mount_a = FuseMount(client_configs.get(client_a_role, {}), test_dir, client_a_id, client_a_remote) - mount_b = FuseMount(client_configs.get(client_b_role, {}), test_dir, client_b_id, client_b_remote) - - # Attach environment references to test case - # ========================================== - TestClientRecovery.mds_reconnect_timeout = int(fs.mds_asok( - ['config', 'get', 'mds_reconnect_timeout'] - )['mds_reconnect_timeout']) - TestClientRecovery.mds_session_timeout = int(fs.mds_asok( - ['config', 'get', 'mds_session_timeout'] - )['mds_session_timeout']) - TestClientRecovery.fs = fs - TestClientRecovery.mount_a = mount_a - TestClientRecovery.mount_b = mount_b - - # Stash references on ctx so that we can easily debug in interactive mode - # ======================================================================= - ctx.filesystem = fs - ctx.mount_a = mount_a - ctx.mount_b = mount_b - - # Execute test suite - # ================== - suite = unittest.TestLoader().loadTestsFromTestCase(TestClientRecovery) - if ctx.config.get("interactive-on-error", False): - InteractiveFailureResult.ctx = ctx - result_class = InteractiveFailureResult - else: - result_class = unittest.TextTestResult - result = unittest.TextTestRunner( - stream=LogStream(), - resultclass=result_class, - verbosity=2, - failfast=True).run(suite) - - if not result.wasSuccessful(): - result.printErrors() # duplicate output at end for convenience - raise RuntimeError("Test failure.") - - # Continue to any downstream tasks - # ================================ - yield diff --git a/teuthology/task/mds_creation_failure.py b/teuthology/task/mds_creation_failure.py deleted file mode 100644 index d1de156944..0000000000 --- a/teuthology/task/mds_creation_failure.py +++ /dev/null @@ -1,85 +0,0 @@ - -import logging -import contextlib -import time -import ceph_manager -from teuthology import misc -from teuthology.orchestra.run import CommandFailedError, Raw - -log = 
logging.getLogger(__name__) - - -@contextlib.contextmanager -def task(ctx, config): - """ - Go through filesystem creation with a synthetic failure in an MDS - in its 'up:creating' state, to exercise the retry behaviour. - """ - # Grab handles to the teuthology objects of interest - mdslist = list(misc.all_roles_of_type(ctx.cluster, 'mds')) - if len(mdslist) != 1: - # Require exactly one MDS, the code path for creation failure when - # a standby is available is different - raise RuntimeError("This task requires exactly one MDS") - - mds_id = mdslist[0] - (mds_remote,) = ctx.cluster.only('mds.{_id}'.format(_id=mds_id)).remotes.iterkeys() - manager = ceph_manager.CephManager( - mds_remote, ctx=ctx, logger=log.getChild('ceph_manager'), - ) - - # Stop MDS - manager.raw_cluster_cmd('mds', 'set', "max_mds", "0") - mds = ctx.daemons.get_daemon('mds', mds_id) - mds.stop() - manager.raw_cluster_cmd('mds', 'fail', mds_id) - - # Reset the filesystem so that next start will go into CREATING - manager.raw_cluster_cmd('fs', 'rm', "default", "--yes-i-really-mean-it") - manager.raw_cluster_cmd('fs', 'new', "default", "metadata", "data") - - # Start the MDS with mds_kill_create_at set, it will crash during creation - mds.restart_with_args(["--mds_kill_create_at=1"]) - try: - mds.wait_for_exit() - except CommandFailedError as e: - if e.exitstatus == 1: - log.info("MDS creation killed as expected") - else: - log.error("Unexpected status code %s" % e.exitstatus) - raise - - # Since I have intentionally caused a crash, I will clean up the resulting core - # file to avoid task.internal.coredump seeing it as a failure. 
- log.info("Removing core file from synthetic MDS failure") - mds_remote.run(args=['rm', '-f', Raw("{archive}/coredump/*.core".format(archive=misc.get_archive_dir(ctx)))]) - - # It should have left the MDS map state still in CREATING - status = manager.get_mds_status(mds_id) - assert status['state'] == 'up:creating' - - # Start the MDS again without the kill flag set, it should proceed with creation successfully - mds.restart() - - # Wait for state ACTIVE - t = 0 - create_timeout = 120 - while True: - status = manager.get_mds_status(mds_id) - if status['state'] == 'up:active': - log.info("MDS creation completed successfully") - break - elif status['state'] == 'up:creating': - log.info("MDS still in creating state") - if t > create_timeout: - log.error("Creating did not complete within %ss" % create_timeout) - raise RuntimeError("Creating did not complete within %ss" % create_timeout) - t += 1 - time.sleep(1) - else: - log.error("Unexpected MDS state: %s" % status['state']) - assert(status['state'] in ['up:active', 'up:creating']) - - # The system should be back up in a happy healthy state, go ahead and run any further tasks - # inside this context. - yield diff --git a/teuthology/task/mds_journal_migration.py b/teuthology/task/mds_journal_migration.py deleted file mode 100644 index 0d2bf09cf5..0000000000 --- a/teuthology/task/mds_journal_migration.py +++ /dev/null @@ -1,106 +0,0 @@ - -import contextlib -import logging -from teuthology import misc - -from teuthology.task.ceph import write_conf -from teuthology.task.cephfs.filesystem import Filesystem - -log = logging.getLogger(__name__) - - -JOURNAL_FORMAT_LEGACY = 0 -JOURNAL_FORMAT_RESILIENT = 1 - - -@contextlib.contextmanager -def task(ctx, config): - """ - Given a Ceph cluster has already been set up, exercise the migration - of the CephFS journal from an older format to the latest format. On - successful completion the filesystem will be running with a journal - in the new format. 
- - Optionally specify which client to use like this: - - - mds-journal_migration: - client: client.0 - - """ - if not hasattr(ctx, 'ceph'): - raise RuntimeError("This task must be nested in 'ceph' task") - - if not hasattr(ctx, 'mounts'): - raise RuntimeError("This task must be nested inside 'kclient' or 'ceph_fuse' task") - - # Determine which client we will use - if config and 'client' in config: - # Use client specified in config - client_role = config['client'] - client_list = list(misc.get_clients(ctx, [client_role])) - try: - client_id = client_list[0][0] - except IndexError: - raise RuntimeError("Client role '{0}' not found".format(client_role)) - else: - # Pick one arbitrary client to use - client_list = list(misc.all_roles_of_type(ctx.cluster, 'client')) - try: - client_id = client_list[0] - except IndexError: - raise RuntimeError("This task requires at least one client") - - fs = Filesystem(ctx, config) - ctx.fs = fs - old_journal_version = JOURNAL_FORMAT_LEGACY - new_journal_version = JOURNAL_FORMAT_RESILIENT - - # Set config so that journal will be created in older format - if 'mds' not in ctx.ceph.conf: - ctx.ceph.conf['mds'] = {} - ctx.ceph.conf['mds']['mds journal format'] = old_journal_version - write_conf(ctx) # XXX because we don't have the ceph task's config object, if they - # used a different config path this won't work. - - # Create a filesystem using the older journal format. - for mount in ctx.mounts.values(): - mount.umount_wait() - fs.mds_stop() - fs.reset() - fs.mds_restart() - - # Do some client work so that the log is populated with something. - mount = ctx.mounts[client_id] - with mount.mounted(): - mount.create_files() - mount.check_files() # sanity, this should always pass - - # Modify the ceph.conf to ask the MDS to use the new journal format. - ctx.ceph.conf['mds']['mds journal format'] = new_journal_version - write_conf(ctx) - - # Restart the MDS. 
- fs.mds_fail_restart() - fs.wait_for_daemons() - - # This ensures that all daemons come up into a valid state - fs.wait_for_daemons() - - # Check that files created in the initial client workload are still visible - # in a client mount. - with mount.mounted(): - mount.check_files() - - # Verify that the journal really has been rewritten. - journal_version = fs.get_journal_version() - if journal_version != new_journal_version: - raise RuntimeError("Journal was not upgraded, version should be {0} but is {1}".format( - new_journal_version, journal_version() - )) - - # Leave all MDSs and clients running for any child tasks - for mount in ctx.mounts.values(): - mount.mount() - mount.wait_until_mounted() - - yield diff --git a/teuthology/task/mds_thrash.py b/teuthology/task/mds_thrash.py deleted file mode 100644 index c60b741a49..0000000000 --- a/teuthology/task/mds_thrash.py +++ /dev/null @@ -1,352 +0,0 @@ -""" -Thrash mds by simulating failures -""" -import logging -import contextlib -import ceph_manager -import random -import time -from gevent.greenlet import Greenlet -from gevent.event import Event -from teuthology import misc as teuthology - -log = logging.getLogger(__name__) - - -class MDSThrasher(Greenlet): - """ - MDSThrasher:: - - The MDSThrasher thrashes MDSs during execution of other tasks (workunits, etc). - - The config is optional. Many of the config parameters are a a maximum value - to use when selecting a random value from a range. To always use the maximum - value, set no_random to true. The config is a dict containing some or all of: - - seed: [no default] seed the random number generator - - randomize: [default: true] enables randomization and use the max/min values - - max_thrash: [default: 1] the maximum number of MDSs that will be thrashed at - any given time. - - max_thrash_delay: [default: 30] maximum number of seconds to delay before - thrashing again. 
- - max_revive_delay: [default: 10] maximum number of seconds to delay before - bringing back a thrashed MDS - - thrash_in_replay: [default: 0.0] likelihood that the MDS will be thrashed - during replay. Value should be between 0.0 and 1.0 - - max_replay_thrash_delay: [default: 4] maximum number of seconds to delay while in - the replay state before thrashing - - thrash_weights: allows specific MDSs to be thrashed more/less frequently. This option - overrides anything specified by max_thrash. This option is a dict containing - mds.x: weight pairs. For example, [mds.a: 0.7, mds.b: 0.3, mds.c: 0.0]. Each weight - is a value from 0.0 to 1.0. Any MDSs not specified will be automatically - given a weight of 0.0. For a given MDS, by default the trasher delays for up - to max_thrash_delay, trashes, waits for the MDS to recover, and iterates. If a non-zero - weight is specified for an MDS, for each iteration the thrasher chooses whether to thrash - during that iteration based on a random value [0-1] not exceeding the weight of that MDS. - - Examples:: - - - The following example sets the likelihood that mds.a will be thrashed - to 80%, mds.b to 20%, and other MDSs will not be thrashed. It also sets the - likelihood that an MDS will be thrashed in replay to 40%. - Thrash weights do not have to sum to 1. 
- - tasks: - - ceph: - - mds_thrash: - thrash_weights: - - mds.a: 0.8 - - mds.b: 0.2 - thrash_in_replay: 0.4 - - ceph-fuse: - - workunit: - clients: - all: [suites/fsx.sh] - - The following example disables randomization, and uses the max delay values: - - tasks: - - ceph: - - mds_thrash: - max_thrash_delay: 10 - max_revive_delay: 1 - max_replay_thrash_delay: 4 - - """ - - def __init__(self, ctx, manager, config, logger, failure_group, weight): - super(MDSThrasher, self).__init__() - - self.ctx = ctx - self.manager = manager - assert self.manager.is_clean() - - self.stopping = Event() - self.logger = logger - self.config = config - - self.randomize = bool(self.config.get('randomize', True)) - self.max_thrash_delay = float(self.config.get('thrash_delay', 30.0)) - self.thrash_in_replay = float(self.config.get('thrash_in_replay', False)) - assert self.thrash_in_replay >= 0.0 and self.thrash_in_replay <= 1.0, 'thrash_in_replay ({v}) must be between [0.0, 1.0]'.format( - v=self.thrash_in_replay) - - self.max_replay_thrash_delay = float(self.config.get('max_replay_thrash_delay', 4.0)) - - self.max_revive_delay = float(self.config.get('max_revive_delay', 10.0)) - - self.failure_group = failure_group - self.weight = weight - - def _run(self): - try: - self.do_thrash() - except: - # Log exceptions here so we get the full backtrace (it's lost - # by the time someone does a .get() on this greenlet) - self.logger.exception("Exception in do_thrash:") - raise - - def log(self, x): - """Write data to logger assigned to this MDThrasher""" - self.logger.info(x) - - def stop(self): - self.stopping.set() - - def do_thrash(self): - """ - Perform the random thrashing action - """ - self.log('starting mds_do_thrash for failure group: ' + ', '.join( - ['mds.{_id}'.format(_id=_f) for _f in self.failure_group])) - while not self.stopping.is_set(): - delay = self.max_thrash_delay - if self.randomize: - delay = random.randrange(0.0, self.max_thrash_delay) - - if delay > 0.0: - 
self.log('waiting for {delay} secs before thrashing'.format(delay=delay)) - self.stopping.wait(delay) - if self.stopping.is_set(): - continue - - skip = random.randrange(0.0, 1.0) - if self.weight < 1.0 and skip > self.weight: - self.log('skipping thrash iteration with skip ({skip}) > weight ({weight})'.format(skip=skip, - weight=self.weight)) - continue - - # find the active mds in the failure group - statuses = [self.manager.get_mds_status(m) for m in self.failure_group] - actives = filter(lambda s: s and s['state'] == 'up:active', statuses) - assert len(actives) == 1, 'Can only have one active in a failure group' - - active_mds = actives[0]['name'] - active_rank = actives[0]['rank'] - - self.log('kill mds.{id} (rank={r})'.format(id=active_mds, r=active_rank)) - self.manager.kill_mds_by_rank(active_rank) - - # wait for mon to report killed mds as crashed - last_laggy_since = None - itercount = 0 - while True: - failed = self.manager.get_mds_status_all()['failed'] - status = self.manager.get_mds_status(active_mds) - if not status: - break - if 'laggy_since' in status: - last_laggy_since = status['laggy_since'] - break - if any([(f == active_mds) for f in failed]): - break - self.log( - 'waiting till mds map indicates mds.{_id} is laggy/crashed, in failed state, or mds.{_id} is removed from mdsmap'.format( - _id=active_mds)) - itercount = itercount + 1 - if itercount > 10: - self.log('mds map: {status}'.format(status=self.manager.get_mds_status_all())) - time.sleep(2) - if last_laggy_since: - self.log( - 'mds.{_id} reported laggy/crashed since: {since}'.format(_id=active_mds, since=last_laggy_since)) - else: - self.log('mds.{_id} down, removed from mdsmap'.format(_id=active_mds, since=last_laggy_since)) - - # wait for a standby mds to takeover and become active - takeover_mds = None - takeover_rank = None - itercount = 0 - while True: - statuses = [self.manager.get_mds_status(m) for m in self.failure_group] - actives = filter(lambda s: s and s['state'] == 
'up:active', statuses) - if len(actives) > 0: - assert len(actives) == 1, 'Can only have one active in failure group' - takeover_mds = actives[0]['name'] - takeover_rank = actives[0]['rank'] - break - itercount = itercount + 1 - if itercount > 10: - self.log('mds map: {status}'.format(status=self.manager.get_mds_status_all())) - - self.log('New active mds is mds.{_id}'.format(_id=takeover_mds)) - - # wait for a while before restarting old active to become new - # standby - delay = self.max_revive_delay - if self.randomize: - delay = random.randrange(0.0, self.max_revive_delay) - - self.log('waiting for {delay} secs before reviving mds.{id}'.format( - delay=delay, id=active_mds)) - time.sleep(delay) - - self.log('reviving mds.{id}'.format(id=active_mds)) - self.manager.revive_mds(active_mds, standby_for_rank=takeover_rank) - - status = {} - while True: - status = self.manager.get_mds_status(active_mds) - if status and (status['state'] == 'up:standby' or status['state'] == 'up:standby-replay'): - break - self.log( - 'waiting till mds map indicates mds.{_id} is in standby or standby-replay'.format(_id=active_mds)) - time.sleep(2) - self.log('mds.{_id} reported in {state} state'.format(_id=active_mds, state=status['state'])) - - # don't do replay thrashing right now - continue - # this might race with replay -> active transition... 
- if status['state'] == 'up:replay' and random.randrange(0.0, 1.0) < self.thrash_in_replay: - - delay = self.max_replay_thrash_delay - if self.randomize: - delay = random.randrange(0.0, self.max_replay_thrash_delay) - time.sleep(delay) - self.log('kill replaying mds.{id}'.format(id=self.to_kill)) - self.manager.kill_mds(self.to_kill) - - delay = self.max_revive_delay - if self.randomize: - delay = random.randrange(0.0, self.max_revive_delay) - - self.log('waiting for {delay} secs before reviving mds.{id}'.format( - delay=delay, id=self.to_kill)) - time.sleep(delay) - - self.log('revive mds.{id}'.format(id=self.to_kill)) - self.manager.revive_mds(self.to_kill) - - -@contextlib.contextmanager -def task(ctx, config): - """ - Stress test the mds by thrashing while another task/workunit - is running. - - Please refer to MDSThrasher class for further information on the - available options. - """ - if config is None: - config = {} - assert isinstance(config, dict), \ - 'mds_thrash task only accepts a dict for configuration' - mdslist = list(teuthology.all_roles_of_type(ctx.cluster, 'mds')) - assert len(mdslist) > 1, \ - 'mds_thrash task requires at least 2 metadata servers' - - # choose random seed - seed = None - if 'seed' in config: - seed = int(config['seed']) - else: - seed = int(time.time()) - log.info('mds thrasher using random seed: {seed}'.format(seed=seed)) - random.seed(seed) - - max_thrashers = config.get('max_thrash', 1) - thrashers = {} - - (first,) = ctx.cluster.only('mds.{_id}'.format(_id=mdslist[0])).remotes.iterkeys() - manager = ceph_manager.CephManager( - first, ctx=ctx, logger=log.getChild('ceph_manager'), - ) - - # make sure everyone is in active, standby, or standby-replay - log.info('Wait for all MDSs to reach steady state...') - statuses = None - statuses_by_rank = None - while True: - statuses = {m: manager.get_mds_status(m) for m in mdslist} - statuses_by_rank = {} - for _, s in statuses.iteritems(): - if isinstance(s, dict): - 
statuses_by_rank[s['rank']] = s - - ready = filter(lambda (_, s): s is not None and (s['state'] == 'up:active' - or s['state'] == 'up:standby' - or s['state'] == 'up:standby-replay'), - statuses.items()) - if len(ready) == len(statuses): - break - time.sleep(2) - log.info('Ready to start thrashing') - - # setup failure groups - failure_groups = {} - actives = {s['name']: s for (_, s) in statuses.iteritems() if s['state'] == 'up:active'} - log.info('Actives is: {d}'.format(d=actives)) - log.info('Statuses is: {d}'.format(d=statuses_by_rank)) - for active in actives: - for (r, s) in statuses.iteritems(): - if s['standby_for_name'] == active: - if not active in failure_groups: - failure_groups[active] = [] - log.info('Assigning mds rank {r} to failure group {g}'.format(r=r, g=active)) - failure_groups[active].append(r) - - manager.wait_for_clean() - for (active, standbys) in failure_groups.iteritems(): - weight = 1.0 - if 'thrash_weights' in config: - weight = int(config['thrash_weights'].get('mds.{_id}'.format(_id=active), '0.0')) - - failure_group = [active] - failure_group.extend(standbys) - - thrasher = MDSThrasher( - ctx, manager, config, - logger=log.getChild('mds_thrasher.failure_group.[{a}, {sbs}]'.format( - a=active, - sbs=', '.join(standbys) - ) - ), - failure_group=failure_group, - weight=weight) - thrasher.start() - thrashers[active] = thrasher - - # if thrash_weights isn't specified and we've reached max_thrash, - # we're done - if not 'thrash_weights' in config and len(thrashers) == max_thrashers: - break - - try: - log.debug('Yielding') - yield - finally: - log.info('joining mds_thrashers') - for t in thrashers: - log.info('join thrasher for failure group [{fg}]'.format(fg=', '.join(failure_group))) - thrashers[t].stop() - thrashers[t].join() - log.info('done joining') diff --git a/teuthology/task/metadata.yaml b/teuthology/task/metadata.yaml deleted file mode 100644 index ccdc3b077c..0000000000 --- a/teuthology/task/metadata.yaml +++ /dev/null @@ -1,2 
+0,0 @@ -instance-id: test -local-hostname: test diff --git a/teuthology/task/mon_clock_skew_check.py b/teuthology/task/mon_clock_skew_check.py deleted file mode 100644 index 891e6ec484..0000000000 --- a/teuthology/task/mon_clock_skew_check.py +++ /dev/null @@ -1,261 +0,0 @@ -""" -Handle clock skews in monitors. -""" -import logging -import contextlib -import ceph_manager -import time -import gevent -from StringIO import StringIO -from teuthology import misc as teuthology - -log = logging.getLogger(__name__) - -class ClockSkewCheck: - """ - Periodically check if there are any clock skews among the monitors in the - quorum. By default, assume no skews are supposed to exist; that can be - changed using the 'expect-skew' option. If 'fail-on-skew' is set to false, - then we will always succeed and only report skews if any are found. - - This class does not spawn a thread. It assumes that, if that is indeed - wanted, it should be done by a third party (for instance, the task using - this class). We intend it as such in order to reuse this class if need be. - - This task accepts the following options: - - interval amount of seconds to wait in-between checks. (default: 30.0) - max-skew maximum skew, in seconds, that is considered tolerable before - issuing a warning. (default: 0.05) - expect-skew 'true' or 'false', to indicate whether to expect a skew during - the run or not. If 'true', the test will fail if no skew is - found, and succeed if a skew is indeed found; if 'false', it's - the other way around. (default: false) - never-fail Don't fail the run if a skew is detected and we weren't - expecting it, or if no skew is detected and we were expecting - it. (default: False) - - at-least-once Runs at least once, even if we are told to stop. - (default: True) - at-least-once-timeout If we were told to stop but we are attempting to - run at least once, timeout after this many seconds. 
- (default: 600) - - Example: - Expect a skew higher than 0.05 seconds, but only report it without - failing the teuthology run. - - - mon_clock_skew_check: - interval: 30 - max-skew: 0.05 - expect_skew: true - never-fail: true - """ - - def __init__(self, ctx, manager, config, logger): - self.ctx = ctx - self.manager = manager - - self.stopping = False - self.logger = logger - self.config = config - - if self.config is None: - self.config = dict() - - self.check_interval = float(self.config.get('interval', 30.0)) - - first_mon = teuthology.get_first_mon(ctx, config) - remote = ctx.cluster.only(first_mon).remotes.keys()[0] - proc = remote.run( - args=[ - 'sudo', - 'ceph-mon', - '-i', first_mon[4:], - '--show-config-value', 'mon_clock_drift_allowed' - ], stdout=StringIO(), wait=True - ) - self.max_skew = self.config.get('max-skew', float(proc.stdout.getvalue())) - - self.expect_skew = self.config.get('expect-skew', False) - self.never_fail = self.config.get('never-fail', False) - self.at_least_once = self.config.get('at-least-once', True) - self.at_least_once_timeout = self.config.get('at-least-once-timeout', 600.0) - - def info(self, x): - """ - locally define logger for info messages - """ - self.logger.info(x) - - def warn(self, x): - """ - locally define logger for warnings - """ - self.logger.warn(x) - - def debug(self, x): - """ - locally define logger for debug messages - """ - self.logger.info(x) - self.logger.debug(x) - - def finish(self): - """ - Break out of the do_check loop. - """ - self.stopping = True - - def sleep_interval(self): - """ - If a sleep interval is set, sleep for that amount of time. - """ - if self.check_interval > 0.0: - self.debug('sleeping for {s} seconds'.format( - s=self.check_interval)) - time.sleep(self.check_interval) - - def print_skews(self, skews): - """ - Display skew values. 
- """ - total = len(skews) - if total > 0: - self.info('---------- found {n} skews ----------'.format(n=total)) - for mon_id, values in skews.iteritems(): - self.info('mon.{id}: {v}'.format(id=mon_id, v=values)) - self.info('-------------------------------------') - else: - self.info('---------- no skews were found ----------') - - def do_check(self): - """ - Clock skew checker. Loops until finish() is called. - """ - self.info('start checking for clock skews') - skews = dict() - ran_once = False - - started_on = None - - while not self.stopping or (self.at_least_once and not ran_once): - - if self.at_least_once and not ran_once and self.stopping: - if started_on is None: - self.info('kicking-off timeout (if any)') - started_on = time.time() - elif self.at_least_once_timeout > 0.0: - assert time.time() - started_on < self.at_least_once_timeout, \ - 'failed to obtain a timecheck before timeout expired' - - quorum_size = len(teuthology.get_mon_names(self.ctx)) - self.manager.wait_for_mon_quorum_size(quorum_size) - - health = self.manager.get_mon_health(True) - timechecks = health['timechecks'] - - clean_check = False - - if timechecks['round_status'] == 'finished': - assert (timechecks['round'] % 2) == 0, \ - 'timecheck marked as finished but round ' \ - 'disagrees (r {r})'.format( - r=timechecks['round']) - clean_check = True - else: - assert timechecks['round_status'] == 'on-going', \ - 'timecheck status expected \'on-going\' ' \ - 'but found \'{s}\' instead'.format( - s=timechecks['round_status']) - if 'mons' in timechecks.keys() and len(timechecks['mons']) > 1: - self.info('round still on-going, but there are available reports') - else: - self.info('no timechecks available just yet') - self.sleep_interval() - continue - - assert len(timechecks['mons']) > 1, \ - 'there are not enough reported timechecks; ' \ - 'expected > 1 found {n}'.format(n=len(timechecks['mons'])) - - for check in timechecks['mons']: - mon_skew = float(check['skew']) - mon_health = 
check['health'] - mon_id = check['name'] - if abs(mon_skew) > self.max_skew: - assert mon_health == 'HEALTH_WARN', \ - 'mon.{id} health is \'{health}\' but skew {s} > max {ms}'.format( - id=mon_id,health=mon_health,s=abs(mon_skew),ms=self.max_skew) - - log_str = 'mon.{id} with skew {s} > max {ms}'.format( - id=mon_id,s=abs(mon_skew),ms=self.max_skew) - - """ add to skew list """ - details = check['details'] - skews[mon_id] = {'skew': mon_skew, 'details': details} - - if self.expect_skew: - self.info('expected skew: {str}'.format(str=log_str)) - else: - self.warn('unexpected skew: {str}'.format(str=log_str)) - - if clean_check or (self.expect_skew and len(skews) > 0): - ran_once = True - self.print_skews(skews) - self.sleep_interval() - - total = len(skews) - self.print_skews(skews) - - error_str = '' - found_error = False - - if self.expect_skew: - if total == 0: - error_str = 'We were expecting a skew, but none was found!' - found_error = True - else: - if total > 0: - error_str = 'We were not expecting a skew, but we did find it!' - found_error = True - - if found_error: - self.info(error_str) - if not self.never_fail: - assert False, error_str - -@contextlib.contextmanager -def task(ctx, config): - """ - Use clas ClockSkewCheck to check for clock skews on the monitors. - This task will spawn a thread running ClockSkewCheck's do_check(). - - All the configuration will be directly handled by ClockSkewCheck, - so please refer to the class documentation for further information. 
- """ - if config is None: - config = {} - assert isinstance(config, dict), \ - 'mon_clock_skew_check task only accepts a dict for configuration' - log.info('Beginning mon_clock_skew_check...') - first_mon = teuthology.get_first_mon(ctx, config) - (mon,) = ctx.cluster.only(first_mon).remotes.iterkeys() - manager = ceph_manager.CephManager( - mon, - ctx=ctx, - logger=log.getChild('ceph_manager'), - ) - - skew_check = ClockSkewCheck(ctx, - manager, config, - logger=log.getChild('mon_clock_skew_check')) - skew_check_thread = gevent.spawn(skew_check.do_check) - try: - yield - finally: - log.info('joining mon_clock_skew_check') - skew_check.finish() - skew_check_thread.get() - - diff --git a/teuthology/task/mon_recovery.py b/teuthology/task/mon_recovery.py deleted file mode 100644 index bfa2cdf78f..0000000000 --- a/teuthology/task/mon_recovery.py +++ /dev/null @@ -1,80 +0,0 @@ -""" -Monitor recovery -""" -import logging -import ceph_manager -from teuthology import misc as teuthology - - -log = logging.getLogger(__name__) - -def task(ctx, config): - """ - Test monitor recovery. 
- """ - if config is None: - config = {} - assert isinstance(config, dict), \ - 'task only accepts a dict for configuration' - first_mon = teuthology.get_first_mon(ctx, config) - (mon,) = ctx.cluster.only(first_mon).remotes.iterkeys() - - manager = ceph_manager.CephManager( - mon, - ctx=ctx, - logger=log.getChild('ceph_manager'), - ) - - mons = [f.split('.')[1] for f in teuthology.get_mon_names(ctx)] - log.info("mon ids = %s" % mons) - - manager.wait_for_mon_quorum_size(len(mons)) - - log.info('verifying all monitors are in the quorum') - for m in mons: - s = manager.get_mon_status(m) - assert s['state'] == 'leader' or s['state'] == 'peon' - assert len(s['quorum']) == len(mons) - - log.info('restarting each monitor in turn') - for m in mons: - # stop a monitor - manager.kill_mon(m) - manager.wait_for_mon_quorum_size(len(mons) - 1) - - # restart - manager.revive_mon(m) - manager.wait_for_mon_quorum_size(len(mons)) - - # in forward and reverse order, - rmons = mons - rmons.reverse() - for mons in mons, rmons: - log.info('stopping all monitors') - for m in mons: - manager.kill_mon(m) - - log.info('forming a minimal quorum for %s, then adding monitors' % mons) - qnum = (len(mons) / 2) + 1 - num = 0 - for m in mons: - manager.revive_mon(m) - num += 1 - if num >= qnum: - manager.wait_for_mon_quorum_size(num) - - # on both leader and non-leader ranks... 
- for rank in [0, 1]: - # take one out - log.info('removing mon %s' % mons[rank]) - manager.kill_mon(mons[rank]) - manager.wait_for_mon_quorum_size(len(mons) - 1) - - log.info('causing some monitor log activity') - m = 30 - for n in range(1, m): - manager.raw_cluster_cmd('log', '%d of %d' % (n, m)) - - log.info('adding mon %s back in' % mons[rank]) - manager.revive_mon(mons[rank]) - manager.wait_for_mon_quorum_size(len(mons)) diff --git a/teuthology/task/mon_thrash.py b/teuthology/task/mon_thrash.py deleted file mode 100644 index b45aaa9997..0000000000 --- a/teuthology/task/mon_thrash.py +++ /dev/null @@ -1,343 +0,0 @@ -""" -Monitor thrash -""" -import logging -import contextlib -import ceph_manager -import random -import time -import gevent -import json -import math -from teuthology import misc as teuthology - -log = logging.getLogger(__name__) - -def _get_mons(ctx): - """ - Get monitor names from the context value. - """ - mons = [f[len('mon.'):] for f in teuthology.get_mon_names(ctx)] - return mons - -class MonitorThrasher: - """ - How it works:: - - - pick a monitor - - kill it - - wait for quorum to be formed - - sleep for 'revive_delay' seconds - - revive monitor - - wait for quorum to be formed - - sleep for 'thrash_delay' seconds - - Options:: - - seed Seed to use on the RNG to reproduce a previous - behaviour (default: None; i.e., not set) - revive_delay Number of seconds to wait before reviving - the monitor (default: 10) - thrash_delay Number of seconds to wait in-between - test iterations (default: 0) - thrash_store Thrash monitor store before killing the monitor being thrashed (default: False) - thrash_store_probability Probability of thrashing a monitor's store - (default: 50) - thrash_many Thrash multiple monitors instead of just one. If - 'maintain-quorum' is set to False, then we will - thrash up to as many monitors as there are - available. 
(default: False) - maintain_quorum Always maintain quorum, taking care on how many - monitors we kill during the thrashing. If we - happen to only have one or two monitors configured, - if this option is set to True, then we won't run - this task as we cannot guarantee maintenance of - quorum. Setting it to false however would allow the - task to run with as many as just one single monitor. - (default: True) - freeze_mon_probability: how often to freeze the mon instead of killing it, - in % (default: 0) - freeze_mon_duration: how many seconds to freeze the mon (default: 15) - scrub Scrub after each iteration (default: True) - - Note: if 'store-thrash' is set to True, then 'maintain-quorum' must also - be set to True. - - For example:: - - tasks: - - ceph: - - mon_thrash: - revive_delay: 20 - thrash_delay: 1 - thrash_store: true - thrash_store_probability: 40 - seed: 31337 - maintain_quorum: true - thrash_many: true - - ceph-fuse: - - workunit: - clients: - all: - - mon/workloadgen.sh - """ - def __init__(self, ctx, manager, config, logger): - self.ctx = ctx - self.manager = manager - self.manager.wait_for_clean() - - self.stopping = False - self.logger = logger - self.config = config - - if self.config is None: - self.config = dict() - - """ Test reproducibility """ - self.random_seed = self.config.get('seed', None) - - if self.random_seed is None: - self.random_seed = int(time.time()) - - self.rng = random.Random() - self.rng.seed(int(self.random_seed)) - - """ Monitor thrashing """ - self.revive_delay = float(self.config.get('revive_delay', 10.0)) - self.thrash_delay = float(self.config.get('thrash_delay', 0.0)) - - self.thrash_many = self.config.get('thrash_many', False) - self.maintain_quorum = self.config.get('maintain_quorum', True) - - self.scrub = self.config.get('scrub', True) - - self.freeze_mon_probability = float(self.config.get('freeze_mon_probability', 10)) - self.freeze_mon_duration = float(self.config.get('freeze_mon_duration', 15.0)) - - assert 
self.max_killable() > 0, \ - 'Unable to kill at least one monitor with the current config.' - - """ Store thrashing """ - self.store_thrash = self.config.get('store_thrash', False) - self.store_thrash_probability = int( - self.config.get('store_thrash_probability', 50)) - if self.store_thrash: - assert self.store_thrash_probability > 0, \ - 'store_thrash is set, probability must be > 0' - assert self.maintain_quorum, \ - 'store_thrash = true must imply maintain_quorum = true' - - self.thread = gevent.spawn(self.do_thrash) - - def log(self, x): - """ - locally log info messages - """ - self.logger.info(x) - - def do_join(self): - """ - Break out of this processes thrashing loop. - """ - self.stopping = True - self.thread.get() - - def should_thrash_store(self): - """ - If allowed, indicate that we should thrash a certain percentage of - the time as determined by the store_thrash_probability value. - """ - if not self.store_thrash: - return False - return self.rng.randrange(0, 101) < self.store_thrash_probability - - def thrash_store(self, mon): - """ - Thrash the monitor specified. - :param mon: monitor to thrash - """ - addr = self.ctx.ceph.conf['mon.%s' % mon]['mon addr'] - self.log('thrashing mon.{id}@{addr} store'.format(id=mon, addr=addr)) - out = self.manager.raw_cluster_cmd('-m', addr, 'sync', 'force') - j = json.loads(out) - assert j['ret'] == 0, \ - 'error forcing store sync on mon.{id}:\n{ret}'.format( - id=mon,ret=out) - - def should_freeze_mon(self): - """ - Indicate that we should freeze a certain percentago of the time - as determined by the freeze_mon_probability value. - """ - return self.rng.randrange(0, 101) < self.freeze_mon_probability - - def freeze_mon(self, mon): - """ - Send STOP signal to freeze the monitor. - """ - log.info('Sending STOP to mon %s', mon) - self.manager.signal_mon(mon, 19) # STOP - - def unfreeze_mon(self, mon): - """ - Send CONT signal to unfreeze the monitor. 
- """ - log.info('Sending CONT to mon %s', mon) - self.manager.signal_mon(mon, 18) # CONT - - def kill_mon(self, mon): - """ - Kill the monitor specified - """ - self.log('killing mon.{id}'.format(id=mon)) - self.manager.kill_mon(mon) - - def revive_mon(self, mon): - """ - Revive the monitor specified - """ - self.log('killing mon.{id}'.format(id=mon)) - self.log('reviving mon.{id}'.format(id=mon)) - self.manager.revive_mon(mon) - - def max_killable(self): - """ - Return the maximum number of monitors we can kill. - """ - m = len(_get_mons(self.ctx)) - if self.maintain_quorum: - return max(math.ceil(m/2.0)-1, 0) - else: - return m - - def do_thrash(self): - """ - Cotinuously loop and thrash the monitors. - """ - self.log('start thrashing') - self.log('seed: {s}, revive delay: {r}, thrash delay: {t} '\ - 'thrash many: {tm}, maintain quorum: {mq} '\ - 'store thrash: {st}, probability: {stp} '\ - 'freeze mon: prob {fp} duration {fd}'.format( - s=self.random_seed,r=self.revive_delay,t=self.thrash_delay, - tm=self.thrash_many, mq=self.maintain_quorum, - st=self.store_thrash,stp=self.store_thrash_probability, - fp=self.freeze_mon_probability,fd=self.freeze_mon_duration, - )) - - while not self.stopping: - mons = _get_mons(self.ctx) - self.manager.wait_for_mon_quorum_size(len(mons)) - self.log('making sure all monitors are in the quorum') - for m in mons: - s = self.manager.get_mon_status(m) - assert s['state'] == 'leader' or s['state'] == 'peon' - assert len(s['quorum']) == len(mons) - - kill_up_to = self.rng.randrange(1, self.max_killable()+1) - mons_to_kill = self.rng.sample(mons, kill_up_to) - self.log('monitors to thrash: {m}'.format(m=mons_to_kill)) - - mons_to_freeze = [] - for mon in mons: - if mon in mons_to_kill: - continue - if self.should_freeze_mon(): - mons_to_freeze.append(mon) - self.log('monitors to freeze: {m}'.format(m=mons_to_freeze)) - - for mon in mons_to_kill: - self.log('thrashing mon.{m}'.format(m=mon)) - - """ we only thrash stores if we are 
maintaining quorum """ - if self.should_thrash_store() and self.maintain_quorum: - self.thrash_store(mon) - - self.kill_mon(mon) - - if mons_to_freeze: - for mon in mons_to_freeze: - self.freeze_mon(mon) - self.log('waiting for {delay} secs to unfreeze mons'.format( - delay=self.freeze_mon_duration)) - time.sleep(self.freeze_mon_duration) - for mon in mons_to_freeze: - self.unfreeze_mon(mon) - - if self.maintain_quorum: - self.manager.wait_for_mon_quorum_size(len(mons)-len(mons_to_kill)) - for m in mons: - if m in mons_to_kill: - continue - s = self.manager.get_mon_status(m) - assert s['state'] == 'leader' or s['state'] == 'peon' - assert len(s['quorum']) == len(mons)-len(mons_to_kill) - - self.log('waiting for {delay} secs before reviving monitors'.format( - delay=self.revive_delay)) - time.sleep(self.revive_delay) - - for mon in mons_to_kill: - self.revive_mon(mon) - # do more freezes - if mons_to_freeze: - for mon in mons_to_freeze: - self.freeze_mon(mon) - self.log('waiting for {delay} secs to unfreeze mons'.format( - delay=self.freeze_mon_duration)) - time.sleep(self.freeze_mon_duration) - for mon in mons_to_freeze: - self.unfreeze_mon(mon) - - self.manager.wait_for_mon_quorum_size(len(mons)) - for m in mons: - s = self.manager.get_mon_status(m) - assert s['state'] == 'leader' or s['state'] == 'peon' - assert len(s['quorum']) == len(mons) - - if self.scrub: - self.log('triggering scrub') - try: - self.manager.raw_cluster_cmd('scrub') - except Exception: - log.exception("Saw exception while triggering scrub") - - if self.thrash_delay > 0.0: - self.log('waiting for {delay} secs before continuing thrashing'.format( - delay=self.thrash_delay)) - time.sleep(self.thrash_delay) - -@contextlib.contextmanager -def task(ctx, config): - """ - Stress test the monitor by thrashing them while another task/workunit - is running. - - Please refer to MonitorThrasher class for further information on the - available options. 
- """ - if config is None: - config = {} - assert isinstance(config, dict), \ - 'mon_thrash task only accepts a dict for configuration' - assert len(_get_mons(ctx)) > 2, \ - 'mon_thrash task requires at least 3 monitors' - log.info('Beginning mon_thrash...') - first_mon = teuthology.get_first_mon(ctx, config) - (mon,) = ctx.cluster.only(first_mon).remotes.iterkeys() - manager = ceph_manager.CephManager( - mon, - ctx=ctx, - logger=log.getChild('ceph_manager'), - ) - thrash_proc = MonitorThrasher(ctx, - manager, config, - logger=log.getChild('mon_thrasher')) - try: - log.debug('Yielding') - yield - finally: - log.info('joining mon_thrasher') - thrash_proc.do_join() - mons = _get_mons(ctx) - manager.wait_for_mon_quorum_size(len(mons)) diff --git a/teuthology/task/multibench.py b/teuthology/task/multibench.py deleted file mode 100644 index bc22b47059..0000000000 --- a/teuthology/task/multibench.py +++ /dev/null @@ -1,57 +0,0 @@ -""" -Multibench testing -""" -import contextlib -import logging -import radosbench -import time -import copy -import gevent - -log = logging.getLogger(__name__) - -@contextlib.contextmanager -def task(ctx, config): - """ - Run multibench - - The config should be as follows: - - multibench: - time: - segments: - radosbench: - - example: - - tasks: - - ceph: - - multibench: - clients: [client.0] - time: 360 - - interactive: - """ - log.info('Beginning multibench...') - assert isinstance(config, dict), \ - "please list clients to run on" - - def run_one(num): - """Run test spawn from gevent""" - start = time.time() - benchcontext = copy.copy(config.get('radosbench')) - iterations = 0 - while time.time() - start < int(config.get('time', 600)): - log.info("Starting iteration %s of segment %s"%(iterations, num)) - benchcontext['pool'] = str(num) + "-" + str(iterations) - with radosbench.task(ctx, benchcontext): - time.sleep() - iterations += 1 - log.info("Starting %s threads"%(str(config.get('segments', 3)),)) - segments = [ - gevent.spawn(run_one, 
i) - for i in range(0, int(config.get('segments', 3)))] - - try: - yield - finally: - [i.get() for i in segments] diff --git a/teuthology/task/object_source_down.py b/teuthology/task/object_source_down.py deleted file mode 100644 index 1696c55214..0000000000 --- a/teuthology/task/object_source_down.py +++ /dev/null @@ -1,103 +0,0 @@ -""" -Test Object locations going down -""" -import logging -import ceph_manager -from teuthology import misc as teuthology -from teuthology.task_util.rados import rados - -log = logging.getLogger(__name__) - -def task(ctx, config): - """ - Test handling of object location going down - """ - if config is None: - config = {} - assert isinstance(config, dict), \ - 'lost_unfound task only accepts a dict for configuration' - first_mon = teuthology.get_first_mon(ctx, config) - (mon,) = ctx.cluster.only(first_mon).remotes.iterkeys() - - manager = ceph_manager.CephManager( - mon, - ctx=ctx, - logger=log.getChild('ceph_manager'), - ) - - while len(manager.get_osd_status()['up']) < 3: - manager.sleep(10) - manager.wait_for_clean() - - # something that is always there - dummyfile = '/etc/fstab' - - # take 0, 1 out - manager.mark_out_osd(0) - manager.mark_out_osd(1) - manager.wait_for_clean() - - # delay recovery, and make the pg log very long (to prevent backfill) - manager.raw_cluster_cmd( - 'tell', 'osd.0', - 'injectargs', - '--osd-recovery-delay-start 10000 --osd-min-pg-log-entries 100000000' - ) - # delay recovery, and make the pg log very long (to prevent backfill) - manager.raw_cluster_cmd( - 'tell', 'osd.1', - 'injectargs', - '--osd-recovery-delay-start 10000 --osd-min-pg-log-entries 100000000' - ) - # delay recovery, and make the pg log very long (to prevent backfill) - manager.raw_cluster_cmd( - 'tell', 'osd.2', - 'injectargs', - '--osd-recovery-delay-start 10000 --osd-min-pg-log-entries 100000000' - ) - # delay recovery, and make the pg log very long (to prevent backfill) - manager.raw_cluster_cmd( - 'tell', 'osd.3', - 'injectargs', - 
'--osd-recovery-delay-start 10000 --osd-min-pg-log-entries 100000000' - ) - - # kludge to make sure they get a map - rados(ctx, mon, ['-p', 'data', 'put', 'dummy', dummyfile]) - - # create old objects - for f in range(1, 10): - rados(ctx, mon, ['-p', 'data', 'put', 'existing_%d' % f, dummyfile]) - - manager.mark_out_osd(3) - manager.wait_till_active() - - manager.mark_in_osd(0) - manager.wait_till_active() - - manager.raw_cluster_cmd('tell', 'osd.2', 'flush_pg_stats') - manager.raw_cluster_cmd('tell', 'osd.0', 'flush_pg_stats') - - manager.mark_out_osd(2) - manager.wait_till_active() - - # bring up 1 - manager.mark_in_osd(1) - manager.wait_till_active() - - manager.raw_cluster_cmd('tell', 'osd.1', 'flush_pg_stats') - manager.raw_cluster_cmd('tell', 'osd.0', 'flush_pg_stats') - log.info("Getting unfound objects") - unfound = manager.get_num_unfound_objects() - assert not unfound - - manager.kill_osd(2) - manager.mark_down_osd(2) - manager.kill_osd(3) - manager.mark_down_osd(3) - - manager.raw_cluster_cmd('tell', 'osd.1', 'flush_pg_stats') - manager.raw_cluster_cmd('tell', 'osd.0', 'flush_pg_stats') - log.info("Getting unfound objects") - unfound = manager.get_num_unfound_objects() - assert unfound diff --git a/teuthology/task/omapbench.py b/teuthology/task/omapbench.py deleted file mode 100644 index 7d25354532..0000000000 --- a/teuthology/task/omapbench.py +++ /dev/null @@ -1,83 +0,0 @@ -""" -Run omapbench executable within teuthology -""" -import contextlib -import logging - -from ..orchestra import run -from teuthology import misc as teuthology - -log = logging.getLogger(__name__) - -@contextlib.contextmanager -def task(ctx, config): - """ - Run omapbench - - The config should be as follows:: - - omapbench: - clients: [client list] - threads: - objects: - entries: - keysize: - valsize: - increment: - omaptype: - - example:: - - tasks: - - ceph: - - omapbench: - clients: [client.0] - threads: 30 - objects: 1000 - entries: 10 - keysize: 10 - valsize: 100 - 
increment: 100 - omaptype: uniform - - interactive: - """ - log.info('Beginning omapbench...') - assert isinstance(config, dict), \ - "please list clients to run on" - omapbench = {} - testdir = teuthology.get_testdir(ctx) - print(str(config.get('increment',-1))) - for role in config.get('clients', ['client.0']): - assert isinstance(role, basestring) - PREFIX = 'client.' - assert role.startswith(PREFIX) - id_ = role[len(PREFIX):] - (remote,) = ctx.cluster.only(role).remotes.iterkeys() - proc = remote.run( - args=[ - "/bin/sh", "-c", - " ".join(['adjust-ulimits', - 'ceph-coverage', - '{tdir}/archive/coverage', - 'omapbench', - '--name', role[len(PREFIX):], - '-t', str(config.get('threads', 30)), - '-o', str(config.get('objects', 1000)), - '--entries', str(config.get('entries',10)), - '--keysize', str(config.get('keysize',10)), - '--valsize', str(config.get('valsize',1000)), - '--inc', str(config.get('increment',10)), - '--omaptype', str(config.get('omaptype','uniform')) - ]).format(tdir=testdir), - ], - logger=log.getChild('omapbench.{id}'.format(id=id_)), - stdin=run.PIPE, - wait=False - ) - omapbench[id_] = proc - - try: - yield - finally: - log.info('joining omapbench') - run.wait(omapbench.itervalues()) diff --git a/teuthology/task/osd_backfill.py b/teuthology/task/osd_backfill.py deleted file mode 100644 index f3b59e398c..0000000000 --- a/teuthology/task/osd_backfill.py +++ /dev/null @@ -1,105 +0,0 @@ -""" -Osd backfill test -""" -import logging -import ceph_manager -import time -from teuthology import misc as teuthology - - -log = logging.getLogger(__name__) - - -def rados_start(ctx, remote, cmd): - """ - Run a remote rados command (currently used to only write data) - """ - log.info("rados %s" % ' '.join(cmd)) - testdir = teuthology.get_testdir(ctx) - pre = [ - 'adjust-ulimits', - 'ceph-coverage', - '{tdir}/archive/coverage'.format(tdir=testdir), - 'rados', - ]; - pre.extend(cmd) - proc = remote.run( - args=pre, - wait=False, - ) - return proc - -def 
task(ctx, config): - """ - Test backfill - """ - if config is None: - config = {} - assert isinstance(config, dict), \ - 'thrashosds task only accepts a dict for configuration' - first_mon = teuthology.get_first_mon(ctx, config) - (mon,) = ctx.cluster.only(first_mon).remotes.iterkeys() - - num_osds = teuthology.num_instances_of_type(ctx.cluster, 'osd') - log.info('num_osds is %s' % num_osds) - assert num_osds == 3 - - manager = ceph_manager.CephManager( - mon, - ctx=ctx, - logger=log.getChild('ceph_manager'), - ) - - while len(manager.get_osd_status()['up']) < 3: - manager.sleep(10) - manager.raw_cluster_cmd('tell', 'osd.0', 'flush_pg_stats') - manager.raw_cluster_cmd('tell', 'osd.1', 'flush_pg_stats') - manager.raw_cluster_cmd('tell', 'osd.2', 'flush_pg_stats') - manager.wait_for_clean() - - # write some data - p = rados_start(ctx, mon, ['-p', 'rbd', 'bench', '15', 'write', '-b', '4096', - '--no-cleanup']) - err = p.wait() - log.info('err is %d' % err) - - # mark osd.0 out to trigger a rebalance/backfill - manager.mark_out_osd(0) - - # also mark it down to it won't be included in pg_temps - manager.kill_osd(0) - manager.mark_down_osd(0) - - # wait for everything to peer and be happy... 
- manager.raw_cluster_cmd('tell', 'osd.1', 'flush_pg_stats') - manager.raw_cluster_cmd('tell', 'osd.2', 'flush_pg_stats') - manager.wait_for_recovery() - - # write some new data - p = rados_start(ctx, mon, ['-p', 'rbd', 'bench', '30', 'write', '-b', '4096', - '--no-cleanup']) - - time.sleep(15) - - # blackhole + restart osd.1 - # this triggers a divergent backfill target - manager.blackhole_kill_osd(1) - time.sleep(2) - manager.revive_osd(1) - - # wait for our writes to complete + succeed - err = p.wait() - log.info('err is %d' % err) - - # cluster must recover - manager.raw_cluster_cmd('tell', 'osd.1', 'flush_pg_stats') - manager.raw_cluster_cmd('tell', 'osd.2', 'flush_pg_stats') - manager.wait_for_recovery() - - # re-add osd.0 - manager.revive_osd(0) - manager.raw_cluster_cmd('tell', 'osd.1', 'flush_pg_stats') - manager.raw_cluster_cmd('tell', 'osd.2', 'flush_pg_stats') - manager.wait_for_clean() - - diff --git a/teuthology/task/osd_failsafe_enospc.py b/teuthology/task/osd_failsafe_enospc.py deleted file mode 100644 index 8f5da58778..0000000000 --- a/teuthology/task/osd_failsafe_enospc.py +++ /dev/null @@ -1,218 +0,0 @@ -""" -Handle osdfailsafe configuration settings (nearfull ratio and full ratio) -""" -from cStringIO import StringIO -import logging -import time - -import ceph_manager -from ..orchestra import run -from teuthology.task_util.rados import rados -from teuthology import misc as teuthology - -log = logging.getLogger(__name__) - -def task(ctx, config): - """ - Test handling of osd_failsafe_nearfull_ratio and osd_failsafe_full_ratio - configuration settings - - In order for test to pass must use log-whitelist as follows - - tasks: - - chef: - - install: - - ceph: - log-whitelist: ['OSD near full', 'OSD full dropping all updates'] - - osd_failsafe_enospc: - - """ - if config is None: - config = {} - assert isinstance(config, dict), \ - 'osd_failsafe_enospc task only accepts a dict for configuration' - first_mon = teuthology.get_first_mon(ctx, config) - 
(mon,) = ctx.cluster.only(first_mon).remotes.iterkeys() - - manager = ceph_manager.CephManager( - mon, - ctx=ctx, - logger=log.getChild('ceph_manager'), - ) - ctx.manager = manager - - # Give 2 seconds for injectargs + osd_op_complaint_time (30) + 2 * osd_heartbeat_interval (6) + 6 padding - sleep_time = 50 - - # something that is always there - dummyfile = '/etc/fstab' - dummyfile2 = '/etc/resolv.conf' - - # create 1 pg pool with 1 rep which can only be on osd.0 - osds = manager.get_osd_dump() - for osd in osds: - if osd['osd'] != 0: - manager.mark_out_osd(osd['osd']) - - log.info('creating pool foo') - manager.create_pool("foo") - manager.raw_cluster_cmd('osd', 'pool', 'set', 'foo', 'size', '1') - - # State NONE -> NEAR - log.info('1. Verify warning messages when exceeding nearfull_ratio') - - proc = mon.run( - args=[ - 'daemon-helper', - 'kill', - 'ceph', '-w' - ], - stdin=run.PIPE, - stdout=StringIO(), - wait=False, - ) - - manager.raw_cluster_cmd('tell', 'osd.0', 'injectargs', '--osd_failsafe_nearfull_ratio .00001') - - time.sleep(sleep_time) - proc.stdin.close() # causes daemon-helper send SIGKILL to ceph -w - proc.wait() - - lines = proc.stdout.getvalue().split('\n') - - count = len(filter(lambda line: '[WRN] OSD near full' in line, lines)) - assert count == 2, 'Incorrect number of warning messages expected 2 got %d' % count - count = len(filter(lambda line: '[ERR] OSD full dropping all updates' in line, lines)) - assert count == 0, 'Incorrect number of error messages expected 0 got %d' % count - - # State NEAR -> FULL - log.info('2. 
Verify error messages when exceeding full_ratio') - - proc = mon.run( - args=[ - 'daemon-helper', - 'kill', - 'ceph', '-w' - ], - stdin=run.PIPE, - stdout=StringIO(), - wait=False, - ) - - manager.raw_cluster_cmd('tell', 'osd.0', 'injectargs', '--osd_failsafe_full_ratio .00001') - - time.sleep(sleep_time) - proc.stdin.close() # causes daemon-helper send SIGKILL to ceph -w - proc.wait() - - lines = proc.stdout.getvalue().split('\n') - - count = len(filter(lambda line: '[ERR] OSD full dropping all updates' in line, lines)) - assert count == 2, 'Incorrect number of error messages expected 2 got %d' % count - - log.info('3. Verify write failure when exceeding full_ratio') - - # Write data should fail - ret = rados(ctx, mon, ['-p', 'foo', 'put', 'newfile1', dummyfile]) - assert ret != 0, 'Expected write failure but it succeeded with exit status 0' - - # Put back default - manager.raw_cluster_cmd('tell', 'osd.0', 'injectargs', '--osd_failsafe_full_ratio .97') - time.sleep(10) - - # State FULL -> NEAR - log.info('4. Verify write success when NOT exceeding full_ratio') - - # Write should succeed - ret = rados(ctx, mon, ['-p', 'foo', 'put', 'newfile2', dummyfile2]) - assert ret == 0, 'Expected write to succeed, but got exit status %d' % ret - - log.info('5. 
Verify warning messages again when exceeding nearfull_ratio') - - proc = mon.run( - args=[ - 'daemon-helper', - 'kill', - 'ceph', '-w' - ], - stdin=run.PIPE, - stdout=StringIO(), - wait=False, - ) - - time.sleep(sleep_time) - proc.stdin.close() # causes daemon-helper send SIGKILL to ceph -w - proc.wait() - - lines = proc.stdout.getvalue().split('\n') - - count = len(filter(lambda line: '[WRN] OSD near full' in line, lines)) - assert count == 1 or count == 2, 'Incorrect number of warning messages expected 1 or 2 got %d' % count - count = len(filter(lambda line: '[ERR] OSD full dropping all updates' in line, lines)) - assert count == 0, 'Incorrect number of error messages expected 0 got %d' % count - - manager.raw_cluster_cmd('tell', 'osd.0', 'injectargs', '--osd_failsafe_nearfull_ratio .90') - time.sleep(10) - - # State NONE -> FULL - log.info('6. Verify error messages again when exceeding full_ratio') - - proc = mon.run( - args=[ - 'daemon-helper', - 'kill', - 'ceph', '-w' - ], - stdin=run.PIPE, - stdout=StringIO(), - wait=False, - ) - - manager.raw_cluster_cmd('tell', 'osd.0', 'injectargs', '--osd_failsafe_full_ratio .00001') - - time.sleep(sleep_time) - proc.stdin.close() # causes daemon-helper send SIGKILL to ceph -w - proc.wait() - - lines = proc.stdout.getvalue().split('\n') - - count = len(filter(lambda line: '[WRN] OSD near full' in line, lines)) - assert count == 0, 'Incorrect number of warning messages expected 0 got %d' % count - count = len(filter(lambda line: '[ERR] OSD full dropping all updates' in line, lines)) - assert count == 2, 'Incorrect number of error messages expected 2 got %d' % count - - # State FULL -> NONE - log.info('7. 
Verify no messages settings back to default') - - manager.raw_cluster_cmd('tell', 'osd.0', 'injectargs', '--osd_failsafe_full_ratio .97') - time.sleep(10) - - proc = mon.run( - args=[ - 'daemon-helper', - 'kill', - 'ceph', '-w' - ], - stdin=run.PIPE, - stdout=StringIO(), - wait=False, - ) - - time.sleep(sleep_time) - proc.stdin.close() # causes daemon-helper send SIGKILL to ceph -w - proc.wait() - - lines = proc.stdout.getvalue().split('\n') - - count = len(filter(lambda line: '[WRN] OSD near full' in line, lines)) - assert count == 0, 'Incorrect number of warning messages expected 0 got %d' % count - count = len(filter(lambda line: '[ERR] OSD full dropping all updates' in line, lines)) - assert count == 0, 'Incorrect number of error messages expected 0 got %d' % count - - log.info('Test Passed') - - # Bring all OSDs back in - manager.remove_pool("foo") - for osd in osds: - if osd['osd'] != 0: - manager.mark_in_osd(osd['osd']) diff --git a/teuthology/task/osd_recovery.py b/teuthology/task/osd_recovery.py deleted file mode 100644 index b30e7d4223..0000000000 --- a/teuthology/task/osd_recovery.py +++ /dev/null @@ -1,206 +0,0 @@ -""" -osd recovery -""" -import logging -import ceph_manager -import time -from teuthology import misc as teuthology - - -log = logging.getLogger(__name__) - - -def rados_start(testdir, remote, cmd): - """ - Run a remote rados command (currently used to only write data) - """ - log.info("rados %s" % ' '.join(cmd)) - pre = [ - 'adjust-ulimits', - 'ceph-coverage', - '{tdir}/archive/coverage'.format(tdir=testdir), - 'rados', - ]; - pre.extend(cmd) - proc = remote.run( - args=pre, - wait=False, - ) - return proc - -def task(ctx, config): - """ - Test (non-backfill) recovery - """ - if config is None: - config = {} - assert isinstance(config, dict), \ - 'task only accepts a dict for configuration' - testdir = teuthology.get_testdir(ctx) - first_mon = teuthology.get_first_mon(ctx, config) - (mon,) = ctx.cluster.only(first_mon).remotes.iterkeys() - - 
num_osds = teuthology.num_instances_of_type(ctx.cluster, 'osd') - log.info('num_osds is %s' % num_osds) - assert num_osds == 3 - - manager = ceph_manager.CephManager( - mon, - ctx=ctx, - logger=log.getChild('ceph_manager'), - ) - - while len(manager.get_osd_status()['up']) < 3: - manager.sleep(10) - manager.raw_cluster_cmd('tell', 'osd.0', 'flush_pg_stats') - manager.raw_cluster_cmd('tell', 'osd.1', 'flush_pg_stats') - manager.raw_cluster_cmd('tell', 'osd.2', 'flush_pg_stats') - manager.wait_for_clean() - - # test some osdmap flags - manager.raw_cluster_cmd('osd', 'set', 'noin') - manager.raw_cluster_cmd('osd', 'set', 'noout') - manager.raw_cluster_cmd('osd', 'set', 'noup') - manager.raw_cluster_cmd('osd', 'set', 'nodown') - manager.raw_cluster_cmd('osd', 'unset', 'noin') - manager.raw_cluster_cmd('osd', 'unset', 'noout') - manager.raw_cluster_cmd('osd', 'unset', 'noup') - manager.raw_cluster_cmd('osd', 'unset', 'nodown') - - # write some new data - p = rados_start(testdir, mon, ['-p', 'rbd', 'bench', '60', 'write', '-b', '4096', - '--no-cleanup']) - - time.sleep(15) - - # trigger a divergent target: - # blackhole + restart osd.1 (shorter log) - manager.blackhole_kill_osd(1) - # kill osd.2 (longer log... 
we'll make it divergent below) - manager.kill_osd(2) - time.sleep(2) - manager.revive_osd(1) - - # wait for our writes to complete + succeed - err = p.wait() - log.info('err is %d' % err) - - # cluster must repeer - manager.raw_cluster_cmd('tell', 'osd.0', 'flush_pg_stats') - manager.raw_cluster_cmd('tell', 'osd.1', 'flush_pg_stats') - manager.wait_for_active_or_down() - - # write some more (make sure osd.2 really is divergent) - p = rados_start(testdir, mon, ['-p', 'rbd', 'bench', '15', 'write', '-b', '4096']) - p.wait() - - # revive divergent osd - manager.revive_osd(2) - - while len(manager.get_osd_status()['up']) < 3: - log.info('waiting a bit...') - time.sleep(2) - log.info('3 are up!') - - # cluster must recover - manager.raw_cluster_cmd('tell', 'osd.0', 'flush_pg_stats') - manager.raw_cluster_cmd('tell', 'osd.1', 'flush_pg_stats') - manager.raw_cluster_cmd('tell', 'osd.2', 'flush_pg_stats') - manager.wait_for_clean() - - -def test_incomplete_pgs(ctx, config): - """ - Test handling of incomplete pgs. Requires 4 osds. 
- """ - testdir = teuthology.get_testdir(ctx) - if config is None: - config = {} - assert isinstance(config, dict), \ - 'task only accepts a dict for configuration' - first_mon = teuthology.get_first_mon(ctx, config) - (mon,) = ctx.cluster.only(first_mon).remotes.iterkeys() - - num_osds = teuthology.num_instances_of_type(ctx.cluster, 'osd') - log.info('num_osds is %s' % num_osds) - assert num_osds == 4 - - manager = ceph_manager.CephManager( - mon, - ctx=ctx, - logger=log.getChild('ceph_manager'), - ) - - while len(manager.get_osd_status()['up']) < 4: - time.sleep(10) - - manager.raw_cluster_cmd('tell', 'osd.0', 'flush_pg_stats') - manager.raw_cluster_cmd('tell', 'osd.1', 'flush_pg_stats') - manager.raw_cluster_cmd('tell', 'osd.2', 'flush_pg_stats') - manager.raw_cluster_cmd('tell', 'osd.3', 'flush_pg_stats') - manager.wait_for_clean() - - log.info('Testing incomplete pgs...') - - for i in range(4): - manager.set_config( - i, - osd_recovery_delay_start=1000) - - # move data off of osd.0, osd.1 - manager.raw_cluster_cmd('osd', 'out', '0', '1') - manager.raw_cluster_cmd('tell', 'osd.0', 'flush_pg_stats') - manager.raw_cluster_cmd('tell', 'osd.1', 'flush_pg_stats') - manager.raw_cluster_cmd('tell', 'osd.2', 'flush_pg_stats') - manager.raw_cluster_cmd('tell', 'osd.3', 'flush_pg_stats') - manager.wait_for_clean() - - # lots of objects in rbd (no pg log, will backfill) - p = rados_start(testdir, mon, - ['-p', 'rbd', 'bench', '60', 'write', '-b', '1', - '--no-cleanup']) - p.wait() - - # few objects in rbd pool (with pg log, normal recovery) - for f in range(1, 20): - p = rados_start(testdir, mon, ['-p', 'rbd', 'put', - 'foo.%d' % f, '/etc/passwd']) - p.wait() - - # move it back - manager.raw_cluster_cmd('osd', 'in', '0', '1') - manager.raw_cluster_cmd('osd', 'out', '2', '3') - manager.raw_cluster_cmd('tell', 'osd.0', 'flush_pg_stats') - manager.raw_cluster_cmd('tell', 'osd.1', 'flush_pg_stats') - manager.raw_cluster_cmd('tell', 'osd.2', 'flush_pg_stats') - 
manager.raw_cluster_cmd('tell', 'osd.3', 'flush_pg_stats') - manager.wait_for_active() - - assert not manager.is_clean() - assert not manager.is_recovered() - - # kill 2 + 3 - log.info('stopping 2,3') - manager.kill_osd(2) - manager.kill_osd(3) - log.info('...') - manager.raw_cluster_cmd('osd', 'down', '2', '3') - manager.raw_cluster_cmd('tell', 'osd.0', 'flush_pg_stats') - manager.raw_cluster_cmd('tell', 'osd.1', 'flush_pg_stats') - manager.wait_for_active_or_down() - - assert manager.get_num_down() > 0 - - # revive 2 + 3 - manager.revive_osd(2) - manager.revive_osd(3) - while len(manager.get_osd_status()['up']) < 4: - log.info('waiting a bit...') - time.sleep(2) - log.info('all are up!') - - for i in range(4): - manager.kick_recovery_wq(i) - - # cluster must recover - manager.wait_for_clean() diff --git a/teuthology/task/peer.py b/teuthology/task/peer.py deleted file mode 100644 index 8006c3812a..0000000000 --- a/teuthology/task/peer.py +++ /dev/null @@ -1,96 +0,0 @@ -""" -Peer test (Single test, not much configurable here) -""" -import logging -import json - -import ceph_manager -from teuthology import misc as teuthology -from teuthology.task_util.rados import rados - -log = logging.getLogger(__name__) - -def task(ctx, config): - """ - Test peering. 
- """ - if config is None: - config = {} - assert isinstance(config, dict), \ - 'peer task only accepts a dict for configuration' - first_mon = teuthology.get_first_mon(ctx, config) - (mon,) = ctx.cluster.only(first_mon).remotes.iterkeys() - - manager = ceph_manager.CephManager( - mon, - ctx=ctx, - logger=log.getChild('ceph_manager'), - ) - - while len(manager.get_osd_status()['up']) < 3: - manager.sleep(10) - manager.raw_cluster_cmd('tell', 'osd.0', 'flush_pg_stats') - manager.raw_cluster_cmd('tell', 'osd.1', 'flush_pg_stats') - manager.raw_cluster_cmd('tell', 'osd.2', 'flush_pg_stats') - manager.wait_for_clean() - - for i in range(3): - manager.set_config( - i, - osd_recovery_delay_start=120) - - # take on osd down - manager.kill_osd(2) - manager.mark_down_osd(2) - - # kludge to make sure they get a map - rados(ctx, mon, ['-p', 'data', 'get', 'dummy', '-']) - - manager.raw_cluster_cmd('tell', 'osd.0', 'flush_pg_stats') - manager.raw_cluster_cmd('tell', 'osd.1', 'flush_pg_stats') - manager.wait_for_recovery() - - # kill another and revive 2, so that some pgs can't peer. 
- manager.kill_osd(1) - manager.mark_down_osd(1) - manager.revive_osd(2) - manager.wait_till_osd_is_up(2) - - manager.raw_cluster_cmd('tell', 'osd.0', 'flush_pg_stats') - manager.raw_cluster_cmd('tell', 'osd.2', 'flush_pg_stats') - - manager.wait_for_active_or_down() - - manager.raw_cluster_cmd('tell', 'osd.0', 'flush_pg_stats') - manager.raw_cluster_cmd('tell', 'osd.2', 'flush_pg_stats') - - # look for down pgs - num_down_pgs = 0 - pgs = manager.get_pg_stats() - for pg in pgs: - out = manager.raw_cluster_cmd('pg', pg['pgid'], 'query') - log.debug("out string %s",out) - j = json.loads(out) - log.info("pg is %s, query json is %s", pg, j) - - if pg['state'].count('down'): - num_down_pgs += 1 - # verify that it is blocked on osd.1 - rs = j['recovery_state'] - assert len(rs) > 0 - assert rs[0]['name'] == 'Started/Primary/Peering/GetInfo' - assert rs[1]['name'] == 'Started/Primary/Peering' - assert rs[1]['blocked'] - assert rs[1]['down_osds_we_would_probe'] == [1] - assert len(rs[1]['peering_blocked_by']) == 1 - assert rs[1]['peering_blocked_by'][0]['osd'] == 1 - - assert num_down_pgs > 0 - - # bring it all back - manager.revive_osd(1) - manager.wait_till_osd_is_up(1) - manager.raw_cluster_cmd('tell', 'osd.0', 'flush_pg_stats') - manager.raw_cluster_cmd('tell', 'osd.1', 'flush_pg_stats') - manager.raw_cluster_cmd('tell', 'osd.2', 'flush_pg_stats') - manager.wait_for_clean() diff --git a/teuthology/task/peering_speed_test.py b/teuthology/task/peering_speed_test.py deleted file mode 100644 index 6c885f1c96..0000000000 --- a/teuthology/task/peering_speed_test.py +++ /dev/null @@ -1,93 +0,0 @@ -""" -Remotely run peering tests. 
-""" -import logging -import time -from teuthology import misc as teuthology -import ceph_manager - -log = logging.getLogger(__name__) - -from args import argify - -POOLNAME = "POOLNAME" -ARGS = [ - ('num_pgs', 'number of pgs to create', 256, int), - ('max_time', 'seconds to complete peering', 0, int), - ('runs', 'trials to run', 10, int), - ('num_objects', 'objects to create', 256 * 1024, int), - ('object_size', 'size in bytes for objects', 64, int), - ('creation_time_limit', 'time limit for pool population', 60*60, int), - ('create_threads', 'concurrent writes for create', 256, int) - ] - -def setup(ctx, config): - """ - Setup peering test on remotes. - """ - first_mon = teuthology.get_first_mon(ctx, config) - (mon,) = ctx.cluster.only(first_mon).remotes.iterkeys() - ctx.manager = ceph_manager.CephManager( - mon, - ctx=ctx, - logger=log.getChild('ceph_manager'), - ) - ctx.manager.clear_pools() - ctx.manager.create_pool(POOLNAME, config.num_pgs) - log.info("populating pool") - ctx.manager.rados_write_objects( - POOLNAME, - config.num_objects, - config.object_size, - config.creation_time_limit, - config.create_threads) - log.info("done populating pool") - -def do_run(ctx, config): - """ - Perform the test. 
- """ - start = time.time() - # mark in osd - ctx.manager.mark_in_osd(0) - log.info("writing out objects") - ctx.manager.rados_write_objects( - POOLNAME, - config.num_pgs, # write 1 object per pg or so - 1, - config.creation_time_limit, - config.num_pgs, # lots of concurrency - cleanup = True) - peering_end = time.time() - - log.info("peering done, waiting on recovery") - ctx.manager.wait_for_clean() - - log.info("recovery done") - recovery_end = time.time() - if config.max_time: - assert(peering_end - start < config.max_time) - ctx.manager.mark_out_osd(0) - ctx.manager.wait_for_clean() - return { - 'time_to_active': peering_end - start, - 'time_to_clean': recovery_end - start - } - -@argify("peering_speed_test", ARGS) -def task(ctx, config): - """ - Peering speed test - """ - setup(ctx, config) - ctx.manager.mark_out_osd(0) - ctx.manager.wait_for_clean() - ret = [] - for i in range(config.runs): - log.info("Run {i}".format(i = i)) - ret.append(do_run(ctx, config)) - - ctx.manager.mark_in_osd(0) - ctx.summary['recovery_times'] = { - 'runs': ret - } diff --git a/teuthology/task/populate_rbd_pool.py b/teuthology/task/populate_rbd_pool.py deleted file mode 100644 index df9b683b85..0000000000 --- a/teuthology/task/populate_rbd_pool.py +++ /dev/null @@ -1,93 +0,0 @@ -""" -Populate rbd pools -""" -import contextlib -import logging -from ceph_manager import CephManager -from teuthology import misc as teuthology - -log = logging.getLogger(__name__) - -@contextlib.contextmanager -def task(ctx, config): - """ - Populate pools with prefix with - rbd images at snaps - - The config could be as follows:: - - populate_rbd_pool: - client: - pool_prefix: foo - num_pools: 5 - num_images: 10 - num_snaps: 3 - image_size: 10737418240 - """ - if config is None: - config = {} - client = config.get("client", "client.0") - pool_prefix = config.get("pool_prefix", "foo") - num_pools = config.get("num_pools", 2) - num_images = config.get("num_images", 20) - num_snaps = config.get("num_snaps", 
4) - image_size = config.get("image_size", 100) - write_size = config.get("write_size", 1024*1024) - write_threads = config.get("write_threads", 10) - write_total_per_snap = config.get("write_total_per_snap", 1024*1024*30) - - (remote,) = ctx.cluster.only(client).remotes.iterkeys() - - if not hasattr(ctx, 'manager'): - first_mon = teuthology.get_first_mon(ctx, config) - (mon,) = ctx.cluster.only(first_mon).remotes.iterkeys() - ctx.manager = CephManager( - mon, - ctx=ctx, - logger=log.getChild('ceph_manager'), - ) - - for poolid in range(num_pools): - poolname = "%s-%s" % (pool_prefix, str(poolid)) - log.info("Creating pool %s" % (poolname,)) - ctx.manager.create_pool(poolname) - for imageid in range(num_images): - imagename = "rbd-%s" % (str(imageid),) - log.info("Creating imagename %s" % (imagename,)) - remote.run( - args = [ - "rbd", - "create", - imagename, - "--image-format", "1", - "--size", str(image_size), - "--pool", str(poolname)]) - def bench_run(): - remote.run( - args = [ - "rbd", - "bench-write", - imagename, - "--pool", poolname, - "--io-size", str(write_size), - "--io-threads", str(write_threads), - "--io-total", str(write_total_per_snap), - "--io-pattern", "rand"]) - log.info("imagename %s first bench" % (imagename,)) - bench_run() - for snapid in range(num_snaps): - snapname = "snap-%s" % (str(snapid),) - log.info("imagename %s creating snap %s" % (imagename, snapname)) - remote.run( - args = [ - "rbd", "snap", "create", - "--pool", poolname, - "--snap", snapname, - imagename - ]) - bench_run() - - try: - yield - finally: - log.info('done') diff --git a/teuthology/task/qemu.py b/teuthology/task/qemu.py deleted file mode 100644 index a05b4dba0a..0000000000 --- a/teuthology/task/qemu.py +++ /dev/null @@ -1,327 +0,0 @@ -""" -Qemu task -""" -from cStringIO import StringIO - -import contextlib -import logging -import os - -from teuthology import misc as teuthology -from teuthology import contextutil -from teuthology.task import rbd -from ..orchestra 
import run - -log = logging.getLogger(__name__) - -DEFAULT_NUM_RBD = 1 -DEFAULT_IMAGE_URL = 'http://ceph.com/qa/ubuntu-12.04.qcow2' -DEFAULT_MEM = 4096 # in megabytes - -@contextlib.contextmanager -def create_dirs(ctx, config): - """ - Handle directory creation and cleanup - """ - testdir = teuthology.get_testdir(ctx) - for client, client_config in config.iteritems(): - assert 'test' in client_config, 'You must specify a test to run' - (remote,) = ctx.cluster.only(client).remotes.keys() - remote.run( - args=[ - 'install', '-d', '-m0755', '--', - '{tdir}/qemu'.format(tdir=testdir), - '{tdir}/archive/qemu'.format(tdir=testdir), - ] - ) - try: - yield - finally: - for client, client_config in config.iteritems(): - assert 'test' in client_config, 'You must specify a test to run' - (remote,) = ctx.cluster.only(client).remotes.keys() - remote.run( - args=[ - 'rmdir', '{tdir}/qemu'.format(tdir=testdir), run.Raw('||'), 'true', - ] - ) - -@contextlib.contextmanager -def generate_iso(ctx, config): - """Execute system commands to generate iso""" - log.info('generating iso...') - testdir = teuthology.get_testdir(ctx) - for client, client_config in config.iteritems(): - assert 'test' in client_config, 'You must specify a test to run' - src_dir = os.path.dirname(__file__) - userdata_path = os.path.join(testdir, 'qemu', 'userdata.' + client) - metadata_path = os.path.join(testdir, 'qemu', 'metadata.' 
+ client) - - with file(os.path.join(src_dir, 'userdata_setup.yaml'), 'rb') as f: - test_setup = ''.join(f.readlines()) - - with file(os.path.join(src_dir, 'userdata_teardown.yaml'), 'rb') as f: - test_teardown = ''.join(f.readlines()) - - user_data = test_setup - if client_config.get('type', 'filesystem') == 'filesystem': - for i in xrange(0, client_config.get('num_rbd', DEFAULT_NUM_RBD)): - dev_letter = chr(ord('b') + i) - user_data += """ -- | - #!/bin/bash - mkdir /mnt/test_{dev_letter} - mkfs -t xfs /dev/vd{dev_letter} - mount -t xfs /dev/vd{dev_letter} /mnt/test_{dev_letter} -""".format(dev_letter=dev_letter) - - # this may change later to pass the directories as args to the - # script or something. xfstests needs that. - user_data += """ -- | - #!/bin/bash - test -d /mnt/test_b && cd /mnt/test_b - /mnt/cdrom/test.sh > /mnt/log/test.log 2>&1 && touch /mnt/log/success -""" + test_teardown - - (remote,) = ctx.cluster.only(client).remotes.keys() - teuthology.write_file(remote, userdata_path, StringIO(user_data)) - - with file(os.path.join(src_dir, 'metadata.yaml'), 'rb') as f: - teuthology.write_file(remote, metadata_path, f) - - test_file = '{tdir}/qemu/{client}.test.sh'.format(tdir=testdir, client=client) - remote.run( - args=[ - 'wget', '-nv', '-O', test_file, - client_config['test'], - run.Raw('&&'), - 'chmod', '755', test_file, - ], - ) - remote.run( - args=[ - 'genisoimage', '-quiet', '-input-charset', 'utf-8', - '-volid', 'cidata', '-joliet', '-rock', - '-o', '{tdir}/qemu/{client}.iso'.format(tdir=testdir, client=client), - '-graft-points', - 'user-data={userdata}'.format(userdata=userdata_path), - 'meta-data={metadata}'.format(metadata=metadata_path), - 'test.sh={file}'.format(file=test_file), - ], - ) - try: - yield - finally: - for client in config.iterkeys(): - (remote,) = ctx.cluster.only(client).remotes.keys() - remote.run( - args=[ - 'rm', '-f', - '{tdir}/qemu/{client}.iso'.format(tdir=testdir, client=client), - os.path.join(testdir, 'qemu', 
'userdata.' + client), - os.path.join(testdir, 'qemu', 'metadata.' + client), - '{tdir}/qemu/{client}.test.sh'.format(tdir=testdir, client=client), - ], - ) - -@contextlib.contextmanager -def download_image(ctx, config): - """Downland base image, remove image file when done""" - log.info('downloading base image') - testdir = teuthology.get_testdir(ctx) - for client, client_config in config.iteritems(): - (remote,) = ctx.cluster.only(client).remotes.keys() - base_file = '{tdir}/qemu/base.{client}.qcow2'.format(tdir=testdir, client=client) - remote.run( - args=[ - 'wget', '-nv', '-O', base_file, DEFAULT_IMAGE_URL, - ] - ) - try: - yield - finally: - log.debug('cleaning up base image files') - for client in config.iterkeys(): - base_file = '{tdir}/qemu/base.{client}.qcow2'.format( - tdir=testdir, - client=client, - ) - (remote,) = ctx.cluster.only(client).remotes.keys() - remote.run( - args=[ - 'rm', '-f', base_file, - ], - ) - -@contextlib.contextmanager -def run_qemu(ctx, config): - """Setup kvm environment and start qemu""" - procs = [] - testdir = teuthology.get_testdir(ctx) - for client, client_config in config.iteritems(): - (remote,) = ctx.cluster.only(client).remotes.keys() - log_dir = '{tdir}/archive/qemu/{client}'.format(tdir=testdir, client=client) - remote.run( - args=[ - 'mkdir', log_dir, run.Raw('&&'), - 'sudo', 'modprobe', 'kvm', - ] - ) - - base_file = '{tdir}/qemu/base.{client}.qcow2'.format(tdir=testdir, client=client) - args=[ - 'adjust-ulimits', - 'ceph-coverage', - '{tdir}/archive/coverage'.format(tdir=testdir), - 'daemon-helper', - 'term', - 'qemu-system-x86_64', '-enable-kvm', '-nographic', - '-m', str(client_config.get('memory', DEFAULT_MEM)), - # base OS device - '-drive', - 'file={base},format=qcow2,if=virtio'.format(base=base_file), - # cd holding metadata for cloud-init - '-cdrom', '{tdir}/qemu/{client}.iso'.format(tdir=testdir, client=client), - # virtio 9p fs for logging - '-fsdev', - 
'local,id=log,path={log},security_model=none'.format(log=log_dir), - '-device', - 'virtio-9p-pci,fsdev=log,mount_tag=test_log', - ] - - cachemode = 'none' - ceph_config = ctx.ceph.conf.get('global', {}) - ceph_config.update(ctx.ceph.conf.get('client', {})) - ceph_config.update(ctx.ceph.conf.get(client, {})) - if ceph_config.get('rbd cache'): - if ceph_config.get('rbd cache max dirty', 1) > 0: - cachemode = 'writeback' - else: - cachemode = 'writethrough' - - for i in xrange(client_config.get('num_rbd', DEFAULT_NUM_RBD)): - args.extend([ - '-drive', - 'file=rbd:rbd/{img}:id={id},format=raw,if=virtio,cache={cachemode}'.format( - img='{client}.{num}'.format(client=client, num=i), - id=client[len('client.'):], - cachemode=cachemode, - ), - ]) - - log.info('starting qemu...') - procs.append( - remote.run( - args=args, - logger=log.getChild(client), - stdin=run.PIPE, - wait=False, - ) - ) - - try: - yield - finally: - log.info('waiting for qemu tests to finish...') - run.wait(procs) - - log.debug('checking that qemu tests succeeded...') - for client in config.iterkeys(): - (remote,) = ctx.cluster.only(client).remotes.keys() - remote.run( - args=[ - 'test', '-f', - '{tdir}/archive/qemu/{client}/success'.format( - tdir=testdir, - client=client - ), - ], - ) - -@contextlib.contextmanager -def task(ctx, config): - """ - Run a test inside of QEMU on top of rbd. Only one test - is supported per client. - - For example, you can specify which clients to run on:: - - tasks: - - ceph: - - qemu: - client.0: - test: http://ceph.com/qa/test.sh - client.1: - test: http://ceph.com/qa/test2.sh - - Or use the same settings on all clients: - - tasks: - - ceph: - - qemu: - all: - test: http://ceph.com/qa/test.sh - - For tests that don't need a filesystem, set type to block:: - - tasks: - - ceph: - - qemu: - client.0: - test: http://ceph.com/qa/test.sh - type: block - - The test should be configured to run on /dev/vdb and later - devices. 
- - If you want to run a test that uses more than one rbd image, - specify how many images to use:: - - tasks: - - ceph: - - qemu: - client.0: - test: http://ceph.com/qa/test.sh - type: block - num_rbd: 2 - - You can set the amount of memory the VM has (default is 1024 MB):: - - tasks: - - ceph: - - qemu: - client.0: - test: http://ceph.com/qa/test.sh - memory: 512 # megabytes - """ - assert isinstance(config, dict), \ - "task qemu only supports a dictionary for configuration" - - config = teuthology.replace_all_with_clients(ctx.cluster, config) - - managers = [] - for client, client_config in config.iteritems(): - num_rbd = client_config.get('num_rbd', 1) - assert num_rbd > 0, 'at least one rbd device must be used' - for i in xrange(num_rbd): - create_config = { - client: { - 'image_name': - '{client}.{num}'.format(client=client, num=i), - } - } - managers.append( - lambda create_config=create_config: - rbd.create_image(ctx=ctx, config=create_config) - ) - - managers.extend([ - lambda: create_dirs(ctx=ctx, config=config), - lambda: generate_iso(ctx=ctx, config=config), - lambda: download_image(ctx=ctx, config=config), - lambda: run_qemu(ctx=ctx, config=config), - ]) - - with contextutil.nested(*managers): - yield diff --git a/teuthology/task/rados.py b/teuthology/task/rados.py deleted file mode 100644 index 28cccf259f..0000000000 --- a/teuthology/task/rados.py +++ /dev/null @@ -1,200 +0,0 @@ -""" -Rados modle-based integration tests -""" -import contextlib -import logging -import gevent -from ceph_manager import CephManager -from teuthology import misc as teuthology - -from ..orchestra import run - -log = logging.getLogger(__name__) - -@contextlib.contextmanager -def task(ctx, config): - """ - Run RadosModel-based integration tests. 
- - The config should be as follows:: - - rados: - clients: [client list] - ops: - objects: - max_in_flight: - object_size: - min_stride_size: - max_stride_size: - op_weights: - runs: - the pool is remade between runs - ec_pool: use an ec pool - erasure_code_profile: profile to use with the erasure coded pool - pool_snaps: use pool snapshots instead of selfmanaged snapshots - - For example:: - - tasks: - - ceph: - - rados: - clients: [client.0] - ops: 1000 - max_seconds: 0 # 0 for no limit - objects: 25 - max_in_flight: 16 - object_size: 4000000 - min_stride_size: 1024 - max_stride_size: 4096 - op_weights: - read: 20 - write: 10 - delete: 2 - snap_create: 3 - rollback: 2 - snap_remove: 0 - ec_pool: create an ec pool, defaults to False - erasure_code_profile: - name: teuthologyprofile - k: 2 - m: 1 - ruleset-failure-domain: osd - pool_snaps: true - runs: 10 - - interactive: - - Optionally, you can provide the pool name to run against: - - tasks: - - ceph: - - exec: - client.0: - - ceph osd pool create foo - - rados: - clients: [client.0] - pools: [foo] - ... - - Alternatively, you can provide a pool prefix: - - tasks: - - ceph: - - exec: - client.0: - - ceph osd pool create foo.client.0 - - rados: - clients: [client.0] - pool_prefix: foo - ... 
- - """ - log.info('Beginning rados...') - assert isinstance(config, dict), \ - "please list clients to run on" - - object_size = int(config.get('object_size', 4000000)) - op_weights = config.get('op_weights', {}) - testdir = teuthology.get_testdir(ctx) - args = [ - 'adjust-ulimits', - 'ceph-coverage', - '{tdir}/archive/coverage'.format(tdir=testdir), - 'ceph_test_rados'] - if config.get('ec_pool', False): - args.extend(['--ec-pool']) - if config.get('pool_snaps', False): - args.extend(['--pool-snaps']) - args.extend([ - '--op', 'read', str(op_weights.get('read', 100)), - '--op', 'write', str(op_weights.get('write', 100)), - '--op', 'delete', str(op_weights.get('delete', 10)), - '--max-ops', str(config.get('ops', 10000)), - '--objects', str(config.get('objects', 500)), - '--max-in-flight', str(config.get('max_in_flight', 16)), - '--size', str(object_size), - '--min-stride-size', str(config.get('min_stride_size', object_size / 10)), - '--max-stride-size', str(config.get('max_stride_size', object_size / 5)), - '--max-seconds', str(config.get('max_seconds', 0)) - ]) - # Parallel of the op_types in test/osd/TestRados.cc - for field in [ - # read handled above - # write handled above - # delete handled above - "snap_create", - "snap_remove", - "rollback", - "setattr", - "rmattr", - "watch", - "copy_from", - "hit_set_list", - "is_dirty", - "undirty", - "cache_flush", - "cache_try_flush", - "cache_evict", - "append", - ]: - if field in op_weights: - args.extend([ - '--op', field, str(op_weights[field]), - ]) - - def thread(): - """Thread spawned by gevent""" - if not hasattr(ctx, 'manager'): - first_mon = teuthology.get_first_mon(ctx, config) - (mon,) = ctx.cluster.only(first_mon).remotes.iterkeys() - ctx.manager = CephManager( - mon, - ctx=ctx, - logger=log.getChild('ceph_manager'), - ) - - clients = ['client.{id}'.format(id=id_) for id_ in teuthology.all_roles_of_type(ctx.cluster, 'client')] - log.info('clients are %s' % clients) - if config.get('ec_pool', False): - 
profile = config.get('erasure_code_profile', {}) - profile_name = profile.get('name', 'teuthologyprofile') - ctx.manager.create_erasure_code_profile(profile_name, profile) - else: - profile_name = None - for i in range(int(config.get('runs', '1'))): - log.info("starting run %s out of %s", str(i), config.get('runs', '1')) - tests = {} - existing_pools = config.get('pools', []) - created_pools = [] - for role in config.get('clients', clients): - assert isinstance(role, basestring) - PREFIX = 'client.' - assert role.startswith(PREFIX) - id_ = role[len(PREFIX):] - - pool = config.get('pool', None) - if not pool and existing_pools: - pool = existing_pools.pop() - else: - pool = ctx.manager.create_pool_with_unique_name(erasure_code_profile_name=profile_name) - created_pools.append(pool) - - (remote,) = ctx.cluster.only(role).remotes.iterkeys() - proc = remote.run( - args=["CEPH_CLIENT_ID={id_}".format(id_=id_)] + args + - ["--pool", pool], - logger=log.getChild("rados.{id}".format(id=id_)), - stdin=run.PIPE, - wait=False - ) - tests[id_] = proc - run.wait(tests.itervalues()) - - for pool in created_pools: - ctx.manager.remove_pool(pool) - - running = gevent.spawn(thread) - - try: - yield - finally: - log.info('joining rados') - running.get() diff --git a/teuthology/task/radosbench.py b/teuthology/task/radosbench.py deleted file mode 100644 index 53a54f6e08..0000000000 --- a/teuthology/task/radosbench.py +++ /dev/null @@ -1,95 +0,0 @@ -""" -Rados benchmarking -""" -import contextlib -import logging - -from ..orchestra import run -from teuthology import misc as teuthology - -log = logging.getLogger(__name__) - -@contextlib.contextmanager -def task(ctx, config): - """ - Run radosbench - - The config should be as follows: - - radosbench: - clients: [client list] - time: - pool: - unique_pool: use a unique pool, defaults to False - ec_pool: create an ec pool, defaults to False - erasure_code_profile: - name: teuthologyprofile - k: 2 - m: 1 - ruleset-failure-domain: osd - - 
example: - - tasks: - - ceph: - - radosbench: - clients: [client.0] - time: 360 - - interactive: - """ - log.info('Beginning radosbench...') - assert isinstance(config, dict), \ - "please list clients to run on" - radosbench = {} - - testdir = teuthology.get_testdir(ctx) - - for role in config.get('clients', ['client.0']): - assert isinstance(role, basestring) - PREFIX = 'client.' - assert role.startswith(PREFIX) - id_ = role[len(PREFIX):] - (remote,) = ctx.cluster.only(role).remotes.iterkeys() - - if config.get('ec_pool', False): - profile = config.get('erasure_code_profile', {}) - profile_name = profile.get('name', 'teuthologyprofile') - ctx.manager.create_erasure_code_profile(profile_name, profile) - else: - profile_name = None - - pool = 'data' - if config.get('pool'): - pool = config.get('pool') - if pool is not 'data': - ctx.manager.create_pool(pool, erasure_code_profile_name=profile_name) - else: - pool = ctx.manager.create_pool_with_unique_name(erasure_code_profile_name=profile_name) - - proc = remote.run( - args=[ - "/bin/sh", "-c", - " ".join(['adjust-ulimits', - 'ceph-coverage', - '{tdir}/archive/coverage', - 'rados', - '--name', role, - '-p' , pool, - 'bench', str(config.get('time', 360)), 'write', - ]).format(tdir=testdir), - ], - logger=log.getChild('radosbench.{id}'.format(id=id_)), - stdin=run.PIPE, - wait=False - ) - radosbench[id_] = proc - - try: - yield - finally: - timeout = config.get('time', 360) * 5 - log.info('joining radosbench (timing out after %ss)', timeout) - run.wait(radosbench.itervalues(), timeout=timeout) - - if pool is not 'data': - ctx.manager.remove_pool(pool) diff --git a/teuthology/task/radosgw_admin.py b/teuthology/task/radosgw_admin.py deleted file mode 100644 index c57b55203a..0000000000 --- a/teuthology/task/radosgw_admin.py +++ /dev/null @@ -1,983 +0,0 @@ -""" -Rgw admin testing against a running instance -""" -# The test cases in this file have been annotated for inventory. 
-# To extract the inventory (in csv format) use the command: -# -# grep '^ *# TESTCASE' | sed 's/^ *# TESTCASE //' -# - -import copy -import json -import logging -import time - -from cStringIO import StringIO - -import boto.exception -import boto.s3.connection -import boto.s3.acl - -import teuthology.task_util.rgw as rgw_utils - -from teuthology import misc as teuthology -from teuthology.task_util.rgw import rgwadmin - -log = logging.getLogger(__name__) - - -def successful_ops(out): - """Extract total from the first summary entry (presumed to be only one)""" - summary = out['summary'] - if len(summary) == 0: - return 0 - entry = summary[0] - return entry['total']['successful_ops'] - - -def task(ctx, config): - """ - Test radosgw-admin functionality against a running rgw instance. - """ - global log - assert config is None or isinstance(config, list) \ - or isinstance(config, dict), \ - "task s3tests only supports a list or dictionary for configuration" - all_clients = ['client.{id}'.format(id=id_) - for id_ in teuthology.all_roles_of_type(ctx.cluster, 'client')] - if config is None: - config = all_clients - if isinstance(config, list): - config = dict.fromkeys(config) - clients = config.keys() - - multi_region_run = rgw_utils.multi_region_enabled(ctx) - - client = clients[0]; # default choice, multi-region code may overwrite this - if multi_region_run: - client = rgw_utils.get_master_client(ctx, clients) - - # once the client is chosen, pull the host name and assigned port out of - # the role_endpoints that were assigned by the rgw task - (remote_host, remote_port) = ctx.rgw.role_endpoints[client] - - ## - user1='foo' - user2='fud' - subuser1='foo:foo1' - subuser2='foo:foo2' - display_name1='Foo' - display_name2='Fud' - email='foo@foo.com' - email2='bar@bar.com' - access_key='9te6NH5mcdcq0Tc5i8i1' - secret_key='Ny4IOauQoL18Gp2zM7lC1vLmoawgqcYP/YGcWfXu' - access_key2='p5YnriCv1nAtykxBrupQ' - secret_key2='Q8Tk6Q/27hfbFSYdSkPtUqhqx1GgzvpXa4WARozh' - 
swift_secret1='gpS2G9RREMrnbqlp29PP2D36kgPR1tm72n5fPYfL' - swift_secret2='ri2VJQcKSYATOY6uaDUX7pxgkW+W1YmC6OCxPHwy' - - bucket_name='myfoo' - bucket_name2='mybar' - - # connect to rgw - connection = boto.s3.connection.S3Connection( - aws_access_key_id=access_key, - aws_secret_access_key=secret_key, - is_secure=False, - port=remote_port, - host=remote_host, - calling_format=boto.s3.connection.OrdinaryCallingFormat(), - ) - connection2 = boto.s3.connection.S3Connection( - aws_access_key_id=access_key2, - aws_secret_access_key=secret_key2, - is_secure=False, - port=remote_port, - host=remote_host, - calling_format=boto.s3.connection.OrdinaryCallingFormat(), - ) - - # legend (test cases can be easily grep-ed out) - # TESTCASE 'testname','object','method','operation','assertion' - # TESTCASE 'info-nosuch','user','info','non-existent user','fails' - (err, out) = rgwadmin(ctx, client, ['user', 'info', '--uid', user1]) - assert err - - # TESTCASE 'create-ok','user','create','w/all valid info','succeeds' - (err, out) = rgwadmin(ctx, client, [ - 'user', 'create', - '--uid', user1, - '--display-name', display_name1, - '--email', email, - '--access-key', access_key, - '--secret', secret_key, - '--max-buckets', '4' - ], - check_status=True) - - # TESTCASE 'duplicate email','user','create','existing user email','fails' - (err, out) = rgwadmin(ctx, client, [ - 'user', 'create', - '--uid', user2, - '--display-name', display_name2, - '--email', email, - ]) - assert err - - # TESTCASE 'info-existing','user','info','existing user','returns correct info' - (err, out) = rgwadmin(ctx, client, ['user', 'info', '--uid', user1], check_status=True) - assert out['user_id'] == user1 - assert out['email'] == email - assert out['display_name'] == display_name1 - assert len(out['keys']) == 1 - assert out['keys'][0]['access_key'] == access_key - assert out['keys'][0]['secret_key'] == secret_key - assert not out['suspended'] - - # this whole block should only be run if regions have been configured 
- if multi_region_run: - rgw_utils.radosgw_agent_sync_all(ctx) - # post-sync, validate that user1 exists on the sync destination host - for agent_client, c_config in ctx.radosgw_agent.config.iteritems(): - dest_client = c_config['dest'] - (err, out) = rgwadmin(ctx, dest_client, ['metadata', 'list', 'user']) - (err, out) = rgwadmin(ctx, dest_client, ['user', 'info', '--uid', user1], check_status=True) - assert out['user_id'] == user1 - assert out['email'] == email - assert out['display_name'] == display_name1 - assert len(out['keys']) == 1 - assert out['keys'][0]['access_key'] == access_key - assert out['keys'][0]['secret_key'] == secret_key - assert not out['suspended'] - - # compare the metadata between different regions, make sure it matches - log.debug('compare the metadata between different regions, make sure it matches') - for agent_client, c_config in ctx.radosgw_agent.config.iteritems(): - source_client = c_config['src'] - dest_client = c_config['dest'] - (err1, out1) = rgwadmin(ctx, source_client, - ['metadata', 'get', 'user:{uid}'.format(uid=user1)], check_status=True) - (err2, out2) = rgwadmin(ctx, dest_client, - ['metadata', 'get', 'user:{uid}'.format(uid=user1)], check_status=True) - assert out1 == out2 - - # suspend a user on the master, then check the status on the destination - log.debug('suspend a user on the master, then check the status on the destination') - for agent_client, c_config in ctx.radosgw_agent.config.iteritems(): - source_client = c_config['src'] - dest_client = c_config['dest'] - (err, out) = rgwadmin(ctx, source_client, ['user', 'suspend', '--uid', user1]) - rgw_utils.radosgw_agent_sync_all(ctx) - (err, out) = rgwadmin(ctx, dest_client, ['user', 'info', '--uid', user1], check_status=True) - assert out['suspended'] - - # delete a user on the master, then check that it's gone on the destination - log.debug('delete a user on the master, then check that it\'s gone on the destination') - for agent_client, c_config in 
ctx.radosgw_agent.config.iteritems(): - source_client = c_config['src'] - dest_client = c_config['dest'] - (err, out) = rgwadmin(ctx, source_client, ['user', 'rm', '--uid', user1], check_status=True) - rgw_utils.radosgw_agent_sync_all(ctx) - (err, out) = rgwadmin(ctx, source_client, ['user', 'info', '--uid', user1]) - assert out is None - (err, out) = rgwadmin(ctx, dest_client, ['user', 'info', '--uid', user1]) - assert out is None - - # then recreate it so later tests pass - (err, out) = rgwadmin(ctx, client, [ - 'user', 'create', - '--uid', user1, - '--display-name', display_name1, - '--email', email, - '--access-key', access_key, - '--secret', secret_key, - '--max-buckets', '4' - ], - check_status=True) - - # now do the multi-region bucket tests - log.debug('now do the multi-region bucket tests') - - # Create a second user for the following tests - log.debug('Create a second user for the following tests') - (err, out) = rgwadmin(ctx, client, [ - 'user', 'create', - '--uid', user2, - '--display-name', display_name2, - '--email', email2, - '--access-key', access_key2, - '--secret', secret_key2, - '--max-buckets', '4' - ], - check_status=True) - (err, out) = rgwadmin(ctx, client, ['user', 'info', '--uid', user2], check_status=True) - assert out is not None - - # create a bucket and do a sync - log.debug('create a bucket and do a sync') - bucket = connection.create_bucket(bucket_name2) - rgw_utils.radosgw_agent_sync_all(ctx) - - # compare the metadata for the bucket between different regions, make sure it matches - log.debug('compare the metadata for the bucket between different regions, make sure it matches') - for agent_client, c_config in ctx.radosgw_agent.config.iteritems(): - source_client = c_config['src'] - dest_client = c_config['dest'] - (err1, out1) = rgwadmin(ctx, source_client, - ['metadata', 'get', 'bucket:{bucket_name}'.format(bucket_name=bucket_name2)], - check_status=True) - (err2, out2) = rgwadmin(ctx, dest_client, - ['metadata', 'get', 
'bucket:{bucket_name}'.format(bucket_name=bucket_name2)], - check_status=True) - assert out1 == out2 - - # get the bucket.instance info and compare that - src_bucket_id = out1['data']['bucket']['bucket_id'] - dest_bucket_id = out2['data']['bucket']['bucket_id'] - (err1, out1) = rgwadmin(ctx, source_client, ['metadata', 'get', - 'bucket.instance:{bucket_name}:{bucket_instance}'.format( - bucket_name=bucket_name2,bucket_instance=src_bucket_id)], - check_status=True) - (err2, out2) = rgwadmin(ctx, dest_client, ['metadata', 'get', - 'bucket.instance:{bucket_name}:{bucket_instance}'.format( - bucket_name=bucket_name2,bucket_instance=dest_bucket_id)], - check_status=True) - del out1['data']['bucket_info']['bucket']['pool'] - del out1['data']['bucket_info']['bucket']['index_pool'] - del out2['data']['bucket_info']['bucket']['pool'] - del out2['data']['bucket_info']['bucket']['index_pool'] - assert out1 == out2 - - same_region = 0 - for agent_client, c_config in ctx.radosgw_agent.config.iteritems(): - source_client = c_config['src'] - dest_client = c_config['dest'] - - source_region = rgw_utils.region_for_client(ctx, source_client) - dest_region = rgw_utils.region_for_client(ctx, dest_client) - - # 301 is only returned for requests to something in a different region - if source_region == dest_region: - log.debug('301 is only returned for requests to something in a different region') - same_region += 1 - continue - - # Attempt to create a new connection with user1 to the destination RGW - log.debug('Attempt to create a new connection with user1 to the destination RGW') - # and use that to attempt a delete (that should fail) - exception_encountered = False - try: - (dest_remote_host, dest_remote_port) = ctx.rgw.role_endpoints[dest_client] - connection_dest = boto.s3.connection.S3Connection( - aws_access_key_id=access_key, - aws_secret_access_key=secret_key, - is_secure=False, - port=dest_remote_port, - host=dest_remote_host, - 
calling_format=boto.s3.connection.OrdinaryCallingFormat(), - ) - - # this should fail - connection_dest.delete_bucket(bucket_name2) - except boto.exception.S3ResponseError as e: - assert e.status == 301 - exception_encountered = True - - # confirm that the expected exception was seen - assert exception_encountered - - # now delete the bucket on the source RGW and do another sync - log.debug('now delete the bucket on the source RGW and do another sync') - bucket.delete() - rgw_utils.radosgw_agent_sync_all(ctx) - - if same_region == len(ctx.radosgw_agent.config): - bucket.delete() - rgw_utils.radosgw_agent_sync_all(ctx) - - # make sure that the bucket no longer exists in either region - log.debug('make sure that the bucket no longer exists in either region') - for agent_client, c_config in ctx.radosgw_agent.config.iteritems(): - source_client = c_config['src'] - dest_client = c_config['dest'] - (err1, out1) = rgwadmin(ctx, source_client, ['metadata', 'get', - 'bucket:{bucket_name}'.format(bucket_name=bucket_name2)]) - (err2, out2) = rgwadmin(ctx, dest_client, ['metadata', 'get', - 'bucket:{bucket_name}'.format(bucket_name=bucket_name2)]) - # Both of the previous calls should have errors due to requesting - # metadata for non-existent buckets - assert err1 - assert err2 - - # create a bucket and then sync it - log.debug('create a bucket and then sync it') - bucket = connection.create_bucket(bucket_name2) - rgw_utils.radosgw_agent_sync_all(ctx) - - # compare the metadata for the bucket between different regions, make sure it matches - log.debug('compare the metadata for the bucket between different regions, make sure it matches') - for agent_client, c_config in ctx.radosgw_agent.config.iteritems(): - source_client = c_config['src'] - dest_client = c_config['dest'] - (err1, out1) = rgwadmin(ctx, source_client, - ['metadata', 'get', 'bucket:{bucket_name}'.format(bucket_name=bucket_name2)], - check_status=True) - (err2, out2) = rgwadmin(ctx, dest_client, - ['metadata', 
'get', 'bucket:{bucket_name}'.format(bucket_name=bucket_name2)], - check_status=True) - assert out1 == out2 - - # Now delete the bucket and recreate it with a different user - log.debug('Now delete the bucket and recreate it with a different user') - # within the same window of time and then sync. - bucket.delete() - bucket = connection2.create_bucket(bucket_name2) - rgw_utils.radosgw_agent_sync_all(ctx) - - # compare the metadata for the bucket between different regions, make sure it matches - log.debug('compare the metadata for the bucket between different regions, make sure it matches') - # user2 should own the bucket in both regions - for agent_client, c_config in ctx.radosgw_agent.config.iteritems(): - source_client = c_config['src'] - dest_client = c_config['dest'] - (err1, out1) = rgwadmin(ctx, source_client, - ['metadata', 'get', 'bucket:{bucket_name}'.format(bucket_name=bucket_name2)], - check_status=True) - (err2, out2) = rgwadmin(ctx, dest_client, - ['metadata', 'get', 'bucket:{bucket_name}'.format(bucket_name=bucket_name2)], - check_status=True) - assert out1 == out2 - assert out1['data']['owner'] == user2 - assert out1['data']['owner'] != user1 - - # now we're going to use this bucket to test meta-data update propagation - log.debug('now we\'re going to use this bucket to test meta-data update propagation') - for agent_client, c_config in ctx.radosgw_agent.config.iteritems(): - source_client = c_config['src'] - dest_client = c_config['dest'] - - # get the metadata so we can tweak it - log.debug('get the metadata so we can tweak it') - (err, orig_data) = rgwadmin(ctx, source_client, - ['metadata', 'get', 'bucket:{bucket_name}'.format(bucket_name=bucket_name2)], - check_status=True) - - # manually edit mtime for this bucket to be 300 seconds in the past - log.debug('manually edit mtime for this bucket to be 300 seconds in the past') - new_data = copy.deepcopy(orig_data) - new_data['mtime'] = orig_data['mtime'] - 300 - assert new_data != orig_data - (err, 
out) = rgwadmin(ctx, source_client, - ['metadata', 'put', 'bucket:{bucket_name}'.format(bucket_name=bucket_name2)], - stdin=StringIO(json.dumps(new_data)), - check_status=True) - - # get the metadata and make sure that the 'put' worked - log.debug('get the metadata and make sure that the \'put\' worked') - (err, out) = rgwadmin(ctx, source_client, - ['metadata', 'get', 'bucket:{bucket_name}'.format(bucket_name=bucket_name2)], - check_status=True) - assert out == new_data - - # sync to propagate the new metadata - log.debug('sync to propagate the new metadata') - rgw_utils.radosgw_agent_sync_all(ctx) - - # get the metadata from the dest and compare it to what we just set - log.debug('get the metadata from the dest and compare it to what we just set') - # and what the source region has. - (err1, out1) = rgwadmin(ctx, source_client, - ['metadata', 'get', 'bucket:{bucket_name}'.format(bucket_name=bucket_name2)], - check_status=True) - (err2, out2) = rgwadmin(ctx, dest_client, - ['metadata', 'get', 'bucket:{bucket_name}'.format(bucket_name=bucket_name2)], - check_status=True) - # yeah for the transitive property - assert out1 == out2 - assert out1 == new_data - - # now we delete the bucket - log.debug('now we delete the bucket') - bucket.delete() - - log.debug('sync to propagate the deleted bucket') - rgw_utils.radosgw_agent_sync_all(ctx) - - # Delete user2 as later tests do not expect it to exist. - # Verify that it is gone on both regions - for agent_client, c_config in ctx.radosgw_agent.config.iteritems(): - source_client = c_config['src'] - dest_client = c_config['dest'] - (err, out) = rgwadmin(ctx, source_client, - ['user', 'rm', '--uid', user2], check_status=True) - rgw_utils.radosgw_agent_sync_all(ctx) - # The two 'user info' calls should fail and not return any data - # since we just deleted this user. 
- (err, out) = rgwadmin(ctx, source_client, ['user', 'info', '--uid', user2]) - assert out is None - (err, out) = rgwadmin(ctx, dest_client, ['user', 'info', '--uid', user2]) - assert out is None - - # Test data sync - - # First create a bucket for data sync test purpose - bucket = connection.create_bucket(bucket_name + 'data') - - # Create a tiny file and check if in sync - for agent_client, c_config in ctx.radosgw_agent.config.iteritems(): - if c_config.get('metadata-only'): - continue - - source_client = c_config['src'] - dest_client = c_config['dest'] - k = boto.s3.key.Key(bucket) - k.key = 'tiny_file' - k.set_contents_from_string("123456789") - time.sleep(rgw_utils.radosgw_data_log_window(ctx, source_client)) - rgw_utils.radosgw_agent_sync_all(ctx, data=True) - (dest_host, dest_port) = ctx.rgw.role_endpoints[dest_client] - dest_connection = boto.s3.connection.S3Connection( - aws_access_key_id=access_key, - aws_secret_access_key=secret_key, - is_secure=False, - port=dest_port, - host=dest_host, - calling_format=boto.s3.connection.OrdinaryCallingFormat(), - ) - dest_k = dest_connection.get_bucket(bucket_name + 'data').get_key('tiny_file') - assert k.get_contents_as_string() == dest_k.get_contents_as_string() - - # check that deleting it removes it from the dest zone - k.delete() - time.sleep(rgw_utils.radosgw_data_log_window(ctx, source_client)) - rgw_utils.radosgw_agent_sync_all(ctx, data=True) - - dest_bucket = dest_connection.get_bucket(bucket_name + 'data') - dest_k = dest_bucket.get_key('tiny_file') - assert dest_k == None, 'object not deleted from destination zone' - - # finally we delete the bucket - bucket.delete() - - bucket = connection.create_bucket(bucket_name + 'data2') - for agent_client, c_config in ctx.radosgw_agent.config.iteritems(): - if c_config.get('metadata-only'): - continue - - source_client = c_config['src'] - dest_client = c_config['dest'] - (dest_host, dest_port) = ctx.rgw.role_endpoints[dest_client] - dest_connection = 
boto.s3.connection.S3Connection( - aws_access_key_id=access_key, - aws_secret_access_key=secret_key, - is_secure=False, - port=dest_port, - host=dest_host, - calling_format=boto.s3.connection.OrdinaryCallingFormat(), - ) - for i in range(20): - k = boto.s3.key.Key(bucket) - k.key = 'tiny_file_' + str(i) - k.set_contents_from_string(str(i) * 100) - - time.sleep(rgw_utils.radosgw_data_log_window(ctx, source_client)) - rgw_utils.radosgw_agent_sync_all(ctx, data=True) - - for i in range(20): - dest_k = dest_connection.get_bucket(bucket_name + 'data2').get_key('tiny_file_' + str(i)) - assert (str(i) * 100) == dest_k.get_contents_as_string() - k = boto.s3.key.Key(bucket) - k.key = 'tiny_file_' + str(i) - k.delete() - - # check that deleting removes the objects from the dest zone - time.sleep(rgw_utils.radosgw_data_log_window(ctx, source_client)) - rgw_utils.radosgw_agent_sync_all(ctx, data=True) - - for i in range(20): - dest_bucket = dest_connection.get_bucket(bucket_name + 'data2') - dest_k = dest_bucket.get_key('tiny_file_' + str(i)) - assert dest_k == None, 'object %d not deleted from destination zone' % i - bucket.delete() - - # end of 'if multi_region_run:' - - # TESTCASE 'suspend-ok','user','suspend','active user','succeeds' - (err, out) = rgwadmin(ctx, client, ['user', 'suspend', '--uid', user1], - check_status=True) - - # TESTCASE 'suspend-suspended','user','suspend','suspended user','succeeds w/advisory' - (err, out) = rgwadmin(ctx, client, ['user', 'info', '--uid', user1], check_status=True) - assert out['suspended'] - - # TESTCASE 're-enable','user','enable','suspended user','succeeds' - (err, out) = rgwadmin(ctx, client, ['user', 'enable', '--uid', user1], check_status=True) - - # TESTCASE 'info-re-enabled','user','info','re-enabled user','no longer suspended' - (err, out) = rgwadmin(ctx, client, ['user', 'info', '--uid', user1], check_status=True) - assert not out['suspended'] - - # TESTCASE 'add-keys','key','create','w/valid info','succeeds' - (err, out) = 
rgwadmin(ctx, client, [ - 'key', 'create', '--uid', user1, - '--access-key', access_key2, '--secret', secret_key2, - ], check_status=True) - - # TESTCASE 'info-new-key','user','info','after key addition','returns all keys' - (err, out) = rgwadmin(ctx, client, ['user', 'info', '--uid', user1], - check_status=True) - assert len(out['keys']) == 2 - assert out['keys'][0]['access_key'] == access_key2 or out['keys'][1]['access_key'] == access_key2 - assert out['keys'][0]['secret_key'] == secret_key2 or out['keys'][1]['secret_key'] == secret_key2 - - # TESTCASE 'rm-key','key','rm','newly added key','succeeds, key is removed' - (err, out) = rgwadmin(ctx, client, [ - 'key', 'rm', '--uid', user1, - '--access-key', access_key2, - ], check_status=True) - assert len(out['keys']) == 1 - assert out['keys'][0]['access_key'] == access_key - assert out['keys'][0]['secret_key'] == secret_key - - # TESTCASE 'add-swift-key','key','create','swift key','succeeds' - subuser_access = 'full' - subuser_perm = 'full-control' - - (err, out) = rgwadmin(ctx, client, [ - 'subuser', 'create', '--subuser', subuser1, - '--access', subuser_access - ], check_status=True) - - # TESTCASE 'add-swift-key','key','create','swift key','succeeds' - (err, out) = rgwadmin(ctx, client, [ - 'subuser', 'modify', '--subuser', subuser1, - '--secret', swift_secret1, - '--key-type', 'swift', - ], check_status=True) - - # TESTCASE 'subuser-perm-mask', 'subuser', 'info', 'test subuser perm mask durability', 'succeeds' - (err, out) = rgwadmin(ctx, client, ['user', 'info', '--uid', user1]) - - assert out['subusers'][0]['permissions'] == subuser_perm - - # TESTCASE 'info-swift-key','user','info','after key addition','returns all keys' - (err, out) = rgwadmin(ctx, client, ['user', 'info', '--uid', user1], check_status=True) - assert len(out['swift_keys']) == 1 - assert out['swift_keys'][0]['user'] == subuser1 - assert out['swift_keys'][0]['secret_key'] == swift_secret1 - - # TESTCASE 
'add-swift-subuser','key','create','swift sub-user key','succeeds' - (err, out) = rgwadmin(ctx, client, [ - 'subuser', 'create', '--subuser', subuser2, - '--secret', swift_secret2, - '--key-type', 'swift', - ], check_status=True) - - # TESTCASE 'info-swift-subuser','user','info','after key addition','returns all sub-users/keys' - (err, out) = rgwadmin(ctx, client, ['user', 'info', '--uid', user1], check_status=True) - assert len(out['swift_keys']) == 2 - assert out['swift_keys'][0]['user'] == subuser2 or out['swift_keys'][1]['user'] == subuser2 - assert out['swift_keys'][0]['secret_key'] == swift_secret2 or out['swift_keys'][1]['secret_key'] == swift_secret2 - - # TESTCASE 'rm-swift-key1','key','rm','subuser','succeeds, one key is removed' - (err, out) = rgwadmin(ctx, client, [ - 'key', 'rm', '--subuser', subuser1, - '--key-type', 'swift', - ], check_status=True) - assert len(out['swift_keys']) == 1 - - # TESTCASE 'rm-subuser','subuser','rm','subuser','success, subuser is removed' - (err, out) = rgwadmin(ctx, client, [ - 'subuser', 'rm', '--subuser', subuser1, - ], check_status=True) - assert len(out['subusers']) == 1 - - # TESTCASE 'rm-subuser-with-keys','subuser','rm','subuser','succeeds, second subser and key is removed' - (err, out) = rgwadmin(ctx, client, [ - 'subuser', 'rm', '--subuser', subuser2, - '--key-type', 'swift', '--purge-keys', - ], check_status=True) - assert len(out['swift_keys']) == 0 - assert len(out['subusers']) == 0 - - # TESTCASE 'bucket-stats','bucket','stats','no session/buckets','succeeds, empty list' - (err, out) = rgwadmin(ctx, client, ['bucket', 'stats', '--uid', user1], - check_status=True) - assert len(out) == 0 - - if multi_region_run: - rgw_utils.radosgw_agent_sync_all(ctx) - - # TESTCASE 'bucket-stats2','bucket','stats','no buckets','succeeds, empty list' - (err, out) = rgwadmin(ctx, client, ['bucket', 'list', '--uid', user1], check_status=True) - assert len(out) == 0 - - # create a first bucket - bucket = 
connection.create_bucket(bucket_name) - - # TESTCASE 'bucket-list','bucket','list','one bucket','succeeds, expected list' - (err, out) = rgwadmin(ctx, client, ['bucket', 'list', '--uid', user1], check_status=True) - assert len(out) == 1 - assert out[0] == bucket_name - - # TESTCASE 'bucket-list-all','bucket','list','all buckets','succeeds, expected list' - (err, out) = rgwadmin(ctx, client, ['bucket', 'list'], check_status=True) - assert len(out) >= 1 - assert bucket_name in out; - - # TESTCASE 'max-bucket-limit,'bucket','create','4 buckets','5th bucket fails due to max buckets == 4' - bucket2 = connection.create_bucket(bucket_name + '2') - bucket3 = connection.create_bucket(bucket_name + '3') - bucket4 = connection.create_bucket(bucket_name + '4') - # the 5th should fail. - failed = False - try: - connection.create_bucket(bucket_name + '5') - except Exception: - failed = True - assert failed - - # delete the buckets - bucket2.delete() - bucket3.delete() - bucket4.delete() - - # TESTCASE 'bucket-stats3','bucket','stats','new empty bucket','succeeds, empty list' - (err, out) = rgwadmin(ctx, client, [ - 'bucket', 'stats', '--bucket', bucket_name], check_status=True) - assert out['owner'] == user1 - bucket_id = out['id'] - - # TESTCASE 'bucket-stats4','bucket','stats','new empty bucket','succeeds, expected bucket ID' - (err, out) = rgwadmin(ctx, client, ['bucket', 'stats', '--uid', user1], check_status=True) - assert len(out) == 1 - assert out[0]['id'] == bucket_id # does it return the same ID twice in a row? 
- - # use some space - key = boto.s3.key.Key(bucket) - key.set_contents_from_string('one') - - # TESTCASE 'bucket-stats5','bucket','stats','after creating key','succeeds, lists one non-empty object' - (err, out) = rgwadmin(ctx, client, [ - 'bucket', 'stats', '--bucket', bucket_name], check_status=True) - assert out['id'] == bucket_id - assert out['usage']['rgw.main']['num_objects'] == 1 - assert out['usage']['rgw.main']['size_kb'] > 0 - - # reclaim it - key.delete() - - # TESTCASE 'bucket unlink', 'bucket', 'unlink', 'unlink bucket from user', 'fails', 'access denied error' - (err, out) = rgwadmin(ctx, client, - ['bucket', 'unlink', '--uid', user1, '--bucket', bucket_name], - check_status=True) - - # create a second user to link the bucket to - (err, out) = rgwadmin(ctx, client, [ - 'user', 'create', - '--uid', user2, - '--display-name', display_name2, - '--access-key', access_key2, - '--secret', secret_key2, - '--max-buckets', '1', - ], - check_status=True) - - # try creating an object with the first user before the bucket is relinked - denied = False - key = boto.s3.key.Key(bucket) - - try: - key.set_contents_from_string('two') - except boto.exception.S3ResponseError: - denied = True - - assert not denied - - # delete the object - key.delete() - - # link the bucket to another user - (err, out) = rgwadmin(ctx, client, ['metadata', 'get', 'bucket:{n}'.format(n=bucket_name)], - check_status=True) - - bucket_data = out['data'] - assert bucket_data['bucket']['name'] == bucket_name - - bucket_id = bucket_data['bucket']['bucket_id'] - - # link the bucket to another user - (err, out) = rgwadmin(ctx, client, ['bucket', 'link', '--uid', user2, '--bucket', bucket_name, '--bucket-id', bucket_id], - check_status=True) - - # try to remove user, should fail (has a linked bucket) - (err, out) = rgwadmin(ctx, client, ['user', 'rm', '--uid', user2]) - assert err - - # TESTCASE 'bucket unlink', 'bucket', 'unlink', 'unlink bucket from user', 'succeeds, bucket unlinked' - (err, out) 
= rgwadmin(ctx, client, ['bucket', 'unlink', '--uid', user2, '--bucket', bucket_name], - check_status=True) - - # relink the bucket to the first user and delete the second user - (err, out) = rgwadmin(ctx, client, - ['bucket', 'link', '--uid', user1, '--bucket', bucket_name, '--bucket-id', bucket_id], - check_status=True) - - (err, out) = rgwadmin(ctx, client, ['user', 'rm', '--uid', user2], - check_status=True) - - # TESTCASE 'object-rm', 'object', 'rm', 'remove object', 'succeeds, object is removed' - - # upload an object - object_name = 'four' - key = boto.s3.key.Key(bucket, object_name) - key.set_contents_from_string(object_name) - - # now delete it - (err, out) = rgwadmin(ctx, client, - ['object', 'rm', '--bucket', bucket_name, '--object', object_name], - check_status=True) - - # TESTCASE 'bucket-stats6','bucket','stats','after deleting key','succeeds, lists one no objects' - (err, out) = rgwadmin(ctx, client, [ - 'bucket', 'stats', '--bucket', bucket_name], - check_status=True) - assert out['id'] == bucket_id - assert out['usage']['rgw.main']['num_objects'] == 0 - - # list log objects - # TESTCASE 'log-list','log','list','after activity','succeeds, lists one no objects' - (err, out) = rgwadmin(ctx, client, ['log', 'list'], check_status=True) - assert len(out) > 0 - - for obj in out: - # TESTCASE 'log-show','log','show','after activity','returns expected info' - if obj[:4] == 'meta' or obj[:4] == 'data': - continue - - (err, rgwlog) = rgwadmin(ctx, client, ['log', 'show', '--object', obj], - check_status=True) - assert len(rgwlog) > 0 - - # exempt bucket_name2 from checking as it was only used for multi-region tests - assert rgwlog['bucket'].find(bucket_name) == 0 or rgwlog['bucket'].find(bucket_name2) == 0 - assert rgwlog['bucket'] != bucket_name or rgwlog['bucket_id'] == bucket_id - assert rgwlog['bucket_owner'] == user1 or rgwlog['bucket'] == bucket_name + '5' or rgwlog['bucket'] == bucket_name2 - for entry in rgwlog['log_entries']: - log.debug('checking 
log entry: ', entry) - assert entry['bucket'] == rgwlog['bucket'] - possible_buckets = [bucket_name + '5', bucket_name2] - user = entry['user'] - assert user == user1 or user.endswith('system-user') or \ - rgwlog['bucket'] in possible_buckets - - # TESTCASE 'log-rm','log','rm','delete log objects','succeeds' - (err, out) = rgwadmin(ctx, client, ['log', 'rm', '--object', obj], - check_status=True) - - # TODO: show log by bucket+date - - # need to wait for all usage data to get flushed, should take up to 30 seconds - timestamp = time.time() - while time.time() - timestamp <= (20 * 60): # wait up to 20 minutes - (err, out) = rgwadmin(ctx, client, ['usage', 'show', '--categories', 'delete_obj']) # last operation we did is delete obj, wait for it to flush - if successful_ops(out) > 0: - break; - time.sleep(1) - - assert time.time() - timestamp <= (20 * 60) - - # TESTCASE 'usage-show' 'usage' 'show' 'all usage' 'succeeds' - (err, out) = rgwadmin(ctx, client, ['usage', 'show'], check_status=True) - assert len(out['entries']) > 0 - assert len(out['summary']) > 0 - user_summary = out['summary'][0] - total = user_summary['total'] - assert total['successful_ops'] > 0 - - # TESTCASE 'usage-show2' 'usage' 'show' 'user usage' 'succeeds' - (err, out) = rgwadmin(ctx, client, ['usage', 'show', '--uid', user1], - check_status=True) - assert len(out['entries']) > 0 - assert len(out['summary']) > 0 - user_summary = out['summary'][0] - for entry in user_summary['categories']: - assert entry['successful_ops'] > 0 - assert user_summary['user'] == user1 - - # TESTCASE 'usage-show3' 'usage' 'show' 'user usage categories' 'succeeds' - test_categories = ['create_bucket', 'put_obj', 'delete_obj', 'delete_bucket'] - for cat in test_categories: - (err, out) = rgwadmin(ctx, client, ['usage', 'show', '--uid', user1, '--categories', cat], - check_status=True) - assert len(out['summary']) > 0 - user_summary = out['summary'][0] - assert user_summary['user'] == user1 - assert 
len(user_summary['categories']) == 1 - entry = user_summary['categories'][0] - assert entry['category'] == cat - assert entry['successful_ops'] > 0 - - # TESTCASE 'usage-trim' 'usage' 'trim' 'user usage' 'succeeds, usage removed' - (err, out) = rgwadmin(ctx, client, ['usage', 'trim', '--uid', user1], - check_status=True) - (err, out) = rgwadmin(ctx, client, ['usage', 'show', '--uid', user1], - check_status=True) - assert len(out['entries']) == 0 - assert len(out['summary']) == 0 - - # TESTCASE 'user-suspend2','user','suspend','existing user','succeeds' - (err, out) = rgwadmin(ctx, client, ['user', 'suspend', '--uid', user1], - check_status=True) - - # TESTCASE 'user-suspend3','user','suspend','suspended user','cannot write objects' - try: - key = boto.s3.key.Key(bucket) - key.set_contents_from_string('five') - except boto.exception.S3ResponseError as e: - assert e.status == 403 - - # TESTCASE 'user-renable2','user','enable','suspended user','succeeds' - (err, out) = rgwadmin(ctx, client, ['user', 'enable', '--uid', user1], - check_status=True) - - # TESTCASE 'user-renable3','user','enable','reenabled user','can write objects' - key = boto.s3.key.Key(bucket) - key.set_contents_from_string('six') - - # TESTCASE 'gc-list', 'gc', 'list', 'get list of objects ready for garbage collection' - - # create an object large enough to be split into multiple parts - test_string = 'foo'*10000000 - - big_key = boto.s3.key.Key(bucket) - big_key.set_contents_from_string(test_string) - - # now delete the head - big_key.delete() - - # wait a bit to give the garbage collector time to cycle - time.sleep(15) - - (err, out) = rgwadmin(ctx, client, ['gc', 'list']) - - assert len(out) > 0 - - # TESTCASE 'gc-process', 'gc', 'process', 'manually collect garbage' - (err, out) = rgwadmin(ctx, client, ['gc', 'process'], check_status=True) - - #confirm - (err, out) = rgwadmin(ctx, client, ['gc', 'list']) - - assert len(out) == 0 - - # TESTCASE 'rm-user-buckets','user','rm','existing user','fails, 
still has buckets' - (err, out) = rgwadmin(ctx, client, ['user', 'rm', '--uid', user1]) - assert err - - # delete should fail because ``key`` still exists - try: - bucket.delete() - except boto.exception.S3ResponseError as e: - assert e.status == 409 - - key.delete() - bucket.delete() - - # TESTCASE 'policy', 'bucket', 'policy', 'get bucket policy', 'returns S3 policy' - bucket = connection.create_bucket(bucket_name) - - # create an object - key = boto.s3.key.Key(bucket) - key.set_contents_from_string('seven') - - # should be private already but guarantee it - key.set_acl('private') - - (err, out) = rgwadmin(ctx, client, - ['policy', '--bucket', bucket.name, '--object', key.key], - check_status=True) - - acl = key.get_xml_acl() - - assert acl == out.strip('\n') - - # add another grantee by making the object public read - key.set_acl('public-read') - - (err, out) = rgwadmin(ctx, client, - ['policy', '--bucket', bucket.name, '--object', key.key], - check_status=True) - - acl = key.get_xml_acl() - assert acl == out.strip('\n') - - # TESTCASE 'rm-bucket', 'bucket', 'rm', 'bucket with objects', 'succeeds' - bucket = connection.create_bucket(bucket_name) - key_name = ['eight', 'nine', 'ten', 'eleven'] - for i in range(4): - key = boto.s3.key.Key(bucket) - key.set_contents_from_string(key_name[i]) - - (err, out) = rgwadmin(ctx, client, - ['bucket', 'rm', '--bucket', bucket_name, '--purge-objects'], - check_status=True) - - # TESTCASE 'caps-add', 'caps', 'add', 'add user cap', 'succeeds' - caps='user=read' - (err, out) = rgwadmin(ctx, client, ['caps', 'add', '--uid', user1, '--caps', caps]) - - assert out['caps'][0]['perm'] == 'read' - - # TESTCASE 'caps-rm', 'caps', 'rm', 'remove existing cap from user', 'succeeds' - (err, out) = rgwadmin(ctx, client, ['caps', 'rm', '--uid', user1, '--caps', caps]) - - assert not out['caps'] - - # TESTCASE 'rm-user','user','rm','existing user','fails, still has buckets' - bucket = connection.create_bucket(bucket_name) - key = 
boto.s3.key.Key(bucket) - - (err, out) = rgwadmin(ctx, client, ['user', 'rm', '--uid', user1]) - assert err - - # TESTCASE 'rm-user2', 'user', 'rm', 'user with data', 'succeeds' - bucket = connection.create_bucket(bucket_name) - key = boto.s3.key.Key(bucket) - key.set_contents_from_string('twelve') - - (err, out) = rgwadmin(ctx, client, - ['user', 'rm', '--uid', user1, '--purge-data' ], - check_status=True) - - # TESTCASE 'rm-user3','user','rm','deleted user','fails' - (err, out) = rgwadmin(ctx, client, ['user', 'info', '--uid', user1]) - assert err - - # TESTCASE 'zone-info', 'zone', 'get', 'get zone info', 'succeeds, has default placement rule' - # - - (err, out) = rgwadmin(ctx, client, ['zone', 'get']) - orig_placement_pools = len(out['placement_pools']) - - # removed this test, it is not correct to assume that zone has default placement, it really - # depends on how we set it up before - # - # assert len(out) > 0 - # assert len(out['placement_pools']) == 1 - - # default_rule = out['placement_pools'][0] - # assert default_rule['key'] == 'default-placement' - - rule={'key': 'new-placement', 'val': {'data_pool': '.rgw.buckets.2', 'index_pool': '.rgw.buckets.index.2'}} - - out['placement_pools'].append(rule) - - (err, out) = rgwadmin(ctx, client, ['zone', 'set'], - stdin=StringIO(json.dumps(out)), - check_status=True) - - (err, out) = rgwadmin(ctx, client, ['zone', 'get']) - assert len(out) > 0 - assert len(out['placement_pools']) == orig_placement_pools + 1 diff --git a/teuthology/task/radosgw_admin_rest.py b/teuthology/task/radosgw_admin_rest.py deleted file mode 100644 index 866ff4f10e..0000000000 --- a/teuthology/task/radosgw_admin_rest.py +++ /dev/null @@ -1,678 +0,0 @@ -""" -Run a series of rgw admin commands through the rest interface. - -The test cases in this file have been annotated for inventory. 
-To extract the inventory (in csv format) use the command: - - grep '^ *# TESTCASE' | sed 's/^ *# TESTCASE //' - -""" -from cStringIO import StringIO -import logging -import json - -import boto.exception -import boto.s3.connection -import boto.s3.acl - -import requests -import time - -from boto.connection import AWSAuthConnection -from teuthology import misc as teuthology - -log = logging.getLogger(__name__) - -def successful_ops(out): - """ - Extract successful operations - :param out: list - """ - summary = out['summary'] - if len(summary) == 0: - return 0 - entry = summary[0] - return entry['total']['successful_ops'] - -def rgwadmin(ctx, client, cmd): - """ - Perform rgw admin command - - :param client: client - :param cmd: command to execute. - :return: command exit status, json result. - """ - log.info('radosgw-admin: %s' % cmd) - testdir = teuthology.get_testdir(ctx) - pre = [ - 'adjust-ulimits', - 'ceph-coverage', - '{tdir}/archive/coverage'.format(tdir=testdir), - 'radosgw-admin', - '--log-to-stderr', - '--format', 'json', - ] - pre.extend(cmd) - (remote,) = ctx.cluster.only(client).remotes.iterkeys() - proc = remote.run( - args=pre, - check_status=False, - stdout=StringIO(), - stderr=StringIO(), - ) - r = proc.exitstatus - out = proc.stdout.getvalue() - j = None - if not r and out != '': - try: - j = json.loads(out) - log.info(' json result: %s' % j) - except ValueError: - j = out - log.info(' raw result: %s' % j) - return (r, j) - - -def rgwadmin_rest(connection, cmd, params=None, headers=None, raw=False): - """ - perform a rest command - """ - log.info('radosgw-admin-rest: %s %s' % (cmd, params)) - put_cmds = ['create', 'link', 'add'] - post_cmds = ['unlink', 'modify'] - delete_cmds = ['trim', 'rm', 'process'] - get_cmds = ['check', 'info', 'show', 'list'] - - bucket_sub_resources = ['object', 'policy', 'index'] - user_sub_resources = ['subuser', 'key', 'caps'] - zone_sub_resources = ['pool', 'log', 'garbage'] - - def get_cmd_method_and_handler(cmd): - 
""" - Get the rest command and handler from information in cmd and - from the imported requests object. - """ - if cmd[1] in put_cmds: - return 'PUT', requests.put - elif cmd[1] in delete_cmds: - return 'DELETE', requests.delete - elif cmd[1] in post_cmds: - return 'POST', requests.post - elif cmd[1] in get_cmds: - return 'GET', requests.get - - def get_resource(cmd): - """ - Get the name of the resource from information in cmd. - """ - if cmd[0] == 'bucket' or cmd[0] in bucket_sub_resources: - if cmd[0] == 'bucket': - return 'bucket', '' - else: - return 'bucket', cmd[0] - elif cmd[0] == 'user' or cmd[0] in user_sub_resources: - if cmd[0] == 'user': - return 'user', '' - else: - return 'user', cmd[0] - elif cmd[0] == 'usage': - return 'usage', '' - elif cmd[0] == 'zone' or cmd[0] in zone_sub_resources: - if cmd[0] == 'zone': - return 'zone', '' - else: - return 'zone', cmd[0] - - def build_admin_request(conn, method, resource = '', headers=None, data='', - query_args=None, params=None): - """ - Build an administative request adapted from the build_request() - method of boto.connection - """ - - path = conn.calling_format.build_path_base('admin', resource) - auth_path = conn.calling_format.build_auth_path('admin', resource) - host = conn.calling_format.build_host(conn.server_name(), 'admin') - if query_args: - path += '?' + query_args - boto.log.debug('path=%s' % path) - auth_path += '?' 
+ query_args - boto.log.debug('auth_path=%s' % auth_path) - return AWSAuthConnection.build_base_http_request(conn, method, path, - auth_path, params, headers, data, host) - - method, handler = get_cmd_method_and_handler(cmd) - resource, query_args = get_resource(cmd) - request = build_admin_request(connection, method, resource, - query_args=query_args, headers=headers) - - url = '{protocol}://{host}{path}'.format(protocol=request.protocol, - host=request.host, path=request.path) - - request.authorize(connection=connection) - result = handler(url, params=params, headers=request.headers) - - if raw: - log.info(' text result: %s' % result.txt) - return result.status_code, result.txt - else: - log.info(' json result: %s' % result.json()) - return result.status_code, result.json() - - -def task(ctx, config): - """ - Test radosgw-admin functionality through the RESTful interface - """ - assert config is None or isinstance(config, list) \ - or isinstance(config, dict), \ - "task s3tests only supports a list or dictionary for configuration" - all_clients = ['client.{id}'.format(id=id_) - for id_ in teuthology.all_roles_of_type(ctx.cluster, 'client')] - if config is None: - config = all_clients - if isinstance(config, list): - config = dict.fromkeys(config) - clients = config.keys() - - # just use the first client... - client = clients[0] - - ## - admin_user = 'ada' - admin_display_name = 'Ms. 
Admin User' - admin_access_key = 'MH1WC2XQ1S8UISFDZC8W' - admin_secret_key = 'dQyrTPA0s248YeN5bBv4ukvKU0kh54LWWywkrpoG' - admin_caps = 'users=read, write; usage=read, write; buckets=read, write; zone=read, write' - - user1 = 'foo' - user2 = 'fud' - subuser1 = 'foo:foo1' - subuser2 = 'foo:foo2' - display_name1 = 'Foo' - display_name2 = 'Fud' - email = 'foo@foo.com' - access_key = '9te6NH5mcdcq0Tc5i8i1' - secret_key = 'Ny4IOauQoL18Gp2zM7lC1vLmoawgqcYP/YGcWfXu' - access_key2 = 'p5YnriCv1nAtykxBrupQ' - secret_key2 = 'Q8Tk6Q/27hfbFSYdSkPtUqhqx1GgzvpXa4WARozh' - swift_secret1 = 'gpS2G9RREMrnbqlp29PP2D36kgPR1tm72n5fPYfL' - swift_secret2 = 'ri2VJQcKSYATOY6uaDUX7pxgkW+W1YmC6OCxPHwy' - - bucket_name = 'myfoo' - - # legend (test cases can be easily grep-ed out) - # TESTCASE 'testname','object','method','operation','assertion' - # TESTCASE 'create-admin-user','user','create','administrative user','succeeds' - (err, out) = rgwadmin(ctx, client, [ - 'user', 'create', - '--uid', admin_user, - '--display-name', admin_display_name, - '--access-key', admin_access_key, - '--secret', admin_secret_key, - '--max-buckets', '0', - '--caps', admin_caps - ]) - logging.error(out) - logging.error(err) - assert not err - - (remote,) = ctx.cluster.only(client).remotes.iterkeys() - remote_host = remote.name.split('@')[1] - admin_conn = boto.s3.connection.S3Connection( - aws_access_key_id=admin_access_key, - aws_secret_access_key=admin_secret_key, - is_secure=False, - port=7280, - host=remote_host, - calling_format=boto.s3.connection.OrdinaryCallingFormat(), - ) - - # TESTCASE 'info-nosuch','user','info','non-existent user','fails' - (ret, out) = rgwadmin_rest(admin_conn, ['user', 'info'], {"uid": user1}) - assert ret == 404 - - # TESTCASE 'create-ok','user','create','w/all valid info','succeeds' - (ret, out) = rgwadmin_rest(admin_conn, - ['user', 'create'], - {'uid' : user1, - 'display-name' : display_name1, - 'email' : email, - 'access-key' : access_key, - 'secret-key' : secret_key, - 
'max-buckets' : '4' - }) - - assert ret == 200 - - # TESTCASE 'info-existing','user','info','existing user','returns correct info' - (ret, out) = rgwadmin_rest(admin_conn, ['user', 'info'], {'uid' : user1}) - - assert out['user_id'] == user1 - assert out['email'] == email - assert out['display_name'] == display_name1 - assert len(out['keys']) == 1 - assert out['keys'][0]['access_key'] == access_key - assert out['keys'][0]['secret_key'] == secret_key - assert not out['suspended'] - - # TESTCASE 'suspend-ok','user','suspend','active user','succeeds' - (ret, out) = rgwadmin_rest(admin_conn, ['user', 'modify'], {'uid' : user1, 'suspended' : True}) - assert ret == 200 - - # TESTCASE 'suspend-suspended','user','suspend','suspended user','succeeds w/advisory' - (ret, out) = rgwadmin_rest(admin_conn, ['user', 'info'], {'uid' : user1}) - assert ret == 200 - assert out['suspended'] - - # TESTCASE 're-enable','user','enable','suspended user','succeeds' - (ret, out) = rgwadmin_rest(admin_conn, ['user', 'modify'], {'uid' : user1, 'suspended' : 'false'}) - assert not err - - # TESTCASE 'info-re-enabled','user','info','re-enabled user','no longer suspended' - (ret, out) = rgwadmin_rest(admin_conn, ['user', 'info'], {'uid' : user1}) - assert ret == 200 - assert not out['suspended'] - - # TESTCASE 'add-keys','key','create','w/valid info','succeeds' - (ret, out) = rgwadmin_rest(admin_conn, - ['key', 'create'], - {'uid' : user1, - 'access-key' : access_key2, - 'secret-key' : secret_key2 - }) - - - assert ret == 200 - - # TESTCASE 'info-new-key','user','info','after key addition','returns all keys' - (ret, out) = rgwadmin_rest(admin_conn, ['user', 'info'], {'uid' : user1}) - assert ret == 200 - assert len(out['keys']) == 2 - assert out['keys'][0]['access_key'] == access_key2 or out['keys'][1]['access_key'] == access_key2 - assert out['keys'][0]['secret_key'] == secret_key2 or out['keys'][1]['secret_key'] == secret_key2 - - # TESTCASE 'rm-key','key','rm','newly added key','succeeds, 
key is removed' - (ret, out) = rgwadmin_rest(admin_conn, - ['key', 'rm'], - {'uid' : user1, - 'access-key' : access_key2 - }) - - assert ret == 200 - - (ret, out) = rgwadmin_rest(admin_conn, ['user', 'info'], {'uid' : user1}) - - assert len(out['keys']) == 1 - assert out['keys'][0]['access_key'] == access_key - assert out['keys'][0]['secret_key'] == secret_key - - # TESTCASE 'add-swift-key','key','create','swift key','succeeds' - (ret, out) = rgwadmin_rest(admin_conn, - ['subuser', 'create'], - {'subuser' : subuser1, - 'secret-key' : swift_secret1, - 'key-type' : 'swift' - }) - - assert ret == 200 - - # TESTCASE 'info-swift-key','user','info','after key addition','returns all keys' - (ret, out) = rgwadmin_rest(admin_conn, ['user', 'info'], {'uid' : user1}) - assert ret == 200 - assert len(out['swift_keys']) == 1 - assert out['swift_keys'][0]['user'] == subuser1 - assert out['swift_keys'][0]['secret_key'] == swift_secret1 - - # TESTCASE 'add-swift-subuser','key','create','swift sub-user key','succeeds' - (ret, out) = rgwadmin_rest(admin_conn, - ['subuser', 'create'], - {'subuser' : subuser2, - 'secret-key' : swift_secret2, - 'key-type' : 'swift' - }) - - assert ret == 200 - - # TESTCASE 'info-swift-subuser','user','info','after key addition','returns all sub-users/keys' - (ret, out) = rgwadmin_rest(admin_conn, ['user', 'info'], {'uid' : user1}) - assert ret == 200 - assert len(out['swift_keys']) == 2 - assert out['swift_keys'][0]['user'] == subuser2 or out['swift_keys'][1]['user'] == subuser2 - assert out['swift_keys'][0]['secret_key'] == swift_secret2 or out['swift_keys'][1]['secret_key'] == swift_secret2 - - # TESTCASE 'rm-swift-key1','key','rm','subuser','succeeds, one key is removed' - (ret, out) = rgwadmin_rest(admin_conn, - ['key', 'rm'], - {'subuser' : subuser1, - 'key-type' :'swift' - }) - - assert ret == 200 - - (ret, out) = rgwadmin_rest(admin_conn, ['user', 'info'], {'uid' : user1}) - assert len(out['swift_keys']) == 1 - - # TESTCASE 
'rm-subuser','subuser','rm','subuser','success, subuser is removed' - (ret, out) = rgwadmin_rest(admin_conn, - ['subuser', 'rm'], - {'subuser' : subuser1 - }) - - assert ret == 200 - - (ret, out) = rgwadmin_rest(admin_conn, ['user', 'info'], {'uid' : user1}) - assert len(out['subusers']) == 1 - - # TESTCASE 'rm-subuser-with-keys','subuser','rm','subuser','succeeds, second subser and key is removed' - (ret, out) = rgwadmin_rest(admin_conn, - ['subuser', 'rm'], - {'subuser' : subuser2, - 'key-type' : 'swift', - '{purge-keys' :True - }) - - assert ret == 200 - - (ret, out) = rgwadmin_rest(admin_conn, ['user', 'info'], {'uid' : user1}) - assert len(out['swift_keys']) == 0 - assert len(out['subusers']) == 0 - - # TESTCASE 'bucket-stats','bucket','info','no session/buckets','succeeds, empty list' - (ret, out) = rgwadmin_rest(admin_conn, ['bucket', 'info'], {'uid' : user1}) - assert ret == 200 - assert len(out) == 0 - - # connect to rgw - connection = boto.s3.connection.S3Connection( - aws_access_key_id=access_key, - aws_secret_access_key=secret_key, - is_secure=False, - port=7280, - host=remote_host, - calling_format=boto.s3.connection.OrdinaryCallingFormat(), - ) - - # TESTCASE 'bucket-stats2','bucket','stats','no buckets','succeeds, empty list' - (ret, out) = rgwadmin_rest(admin_conn, ['bucket', 'info'], {'uid' : user1, 'stats' : True}) - assert ret == 200 - assert len(out) == 0 - - # create a first bucket - bucket = connection.create_bucket(bucket_name) - - # TESTCASE 'bucket-list','bucket','list','one bucket','succeeds, expected list' - (ret, out) = rgwadmin_rest(admin_conn, ['bucket', 'info'], {'uid' : user1}) - assert ret == 200 - assert len(out) == 1 - assert out[0] == bucket_name - - # TESTCASE 'bucket-stats3','bucket','stats','new empty bucket','succeeds, empty list' - (ret, out) = rgwadmin_rest(admin_conn, - ['bucket', 'info'], {'bucket' : bucket_name, 'stats' : True}) - - assert ret == 200 - assert out['owner'] == user1 - bucket_id = out['id'] - - # TESTCASE 
'bucket-stats4','bucket','stats','new empty bucket','succeeds, expected bucket ID' - (ret, out) = rgwadmin_rest(admin_conn, ['bucket', 'info'], {'uid' : user1, 'stats' : True}) - assert ret == 200 - assert len(out) == 1 - assert out[0]['id'] == bucket_id # does it return the same ID twice in a row? - - # use some space - key = boto.s3.key.Key(bucket) - key.set_contents_from_string('one') - - # TESTCASE 'bucket-stats5','bucket','stats','after creating key','succeeds, lists one non-empty object' - (ret, out) = rgwadmin_rest(admin_conn, ['bucket', 'info'], {'bucket' : bucket_name, 'stats' : True}) - assert ret == 200 - assert out['id'] == bucket_id - assert out['usage']['rgw.main']['num_objects'] == 1 - assert out['usage']['rgw.main']['size_kb'] > 0 - - # reclaim it - key.delete() - - # TESTCASE 'bucket unlink', 'bucket', 'unlink', 'unlink bucket from user', 'fails', 'access denied error' - (ret, out) = rgwadmin_rest(admin_conn, ['bucket', 'unlink'], {'uid' : user1, 'bucket' : bucket_name}) - - assert ret == 200 - - # create a second user to link the bucket to - (ret, out) = rgwadmin_rest(admin_conn, - ['user', 'create'], - {'uid' : user2, - 'display-name' : display_name2, - 'access-key' : access_key2, - 'secret-key' : secret_key2, - 'max-buckets' : '1', - }) - - assert ret == 200 - - # try creating an object with the first user before the bucket is relinked - denied = False - key = boto.s3.key.Key(bucket) - - try: - key.set_contents_from_string('two') - except boto.exception.S3ResponseError: - denied = True - - assert not denied - - # delete the object - key.delete() - - # link the bucket to another user - (ret, out) = rgwadmin_rest(admin_conn, ['bucket', 'link'], {'uid' : user2, 'bucket' : bucket_name}) - - assert ret == 200 - - # try creating an object with the first user which should cause an error - key = boto.s3.key.Key(bucket) - - try: - key.set_contents_from_string('three') - except boto.exception.S3ResponseError: - denied = True - - assert denied - - # relink 
the bucket to the first user and delete the second user - (ret, out) = rgwadmin_rest(admin_conn, ['bucket', 'link'], {'uid' : user1, 'bucket' : bucket_name}) - assert ret == 200 - - (ret, out) = rgwadmin_rest(admin_conn, ['user', 'rm'], {'uid' : user2}) - assert ret == 200 - - # TESTCASE 'object-rm', 'object', 'rm', 'remove object', 'succeeds, object is removed' - - # upload an object - object_name = 'four' - key = boto.s3.key.Key(bucket, object_name) - key.set_contents_from_string(object_name) - - # now delete it - (ret, out) = rgwadmin_rest(admin_conn, ['object', 'rm'], {'bucket' : bucket_name, 'object' : object_name}) - assert ret == 200 - - # TESTCASE 'bucket-stats6','bucket','stats','after deleting key','succeeds, lists one no objects' - (ret, out) = rgwadmin_rest(admin_conn, ['bucket', 'info'], {'bucket' : bucket_name, 'stats' : True}) - assert ret == 200 - assert out['id'] == bucket_id - assert out['usage']['rgw.main']['num_objects'] == 0 - - # create a bucket for deletion stats - useless_bucket = connection.create_bucket('useless_bucket') - useless_key = useless_bucket.new_key('useless_key') - useless_key.set_contents_from_string('useless string') - - # delete it - useless_key.delete() - useless_bucket.delete() - - # wait for the statistics to flush - time.sleep(60) - - # need to wait for all usage data to get flushed, should take up to 30 seconds - timestamp = time.time() - while time.time() - timestamp <= (20 * 60): # wait up to 20 minutes - (ret, out) = rgwadmin_rest(admin_conn, ['usage', 'show'], {'categories' : 'delete_obj'}) # last operation we did is delete obj, wait for it to flush - - if successful_ops(out) > 0: - break - time.sleep(1) - - assert time.time() - timestamp <= (20 * 60) - - # TESTCASE 'usage-show' 'usage' 'show' 'all usage' 'succeeds' - (ret, out) = rgwadmin_rest(admin_conn, ['usage', 'show']) - assert ret == 200 - assert len(out['entries']) > 0 - assert len(out['summary']) > 0 - user_summary = out['summary'][0] - total = 
user_summary['total'] - assert total['successful_ops'] > 0 - - # TESTCASE 'usage-show2' 'usage' 'show' 'user usage' 'succeeds' - (ret, out) = rgwadmin_rest(admin_conn, ['usage', 'show'], {'uid' : user1}) - assert ret == 200 - assert len(out['entries']) > 0 - assert len(out['summary']) > 0 - user_summary = out['summary'][0] - for entry in user_summary['categories']: - assert entry['successful_ops'] > 0 - assert user_summary['user'] == user1 - - # TESTCASE 'usage-show3' 'usage' 'show' 'user usage categories' 'succeeds' - test_categories = ['create_bucket', 'put_obj', 'delete_obj', 'delete_bucket'] - for cat in test_categories: - (ret, out) = rgwadmin_rest(admin_conn, ['usage', 'show'], {'uid' : user1, 'categories' : cat}) - assert ret == 200 - assert len(out['summary']) > 0 - user_summary = out['summary'][0] - assert user_summary['user'] == user1 - assert len(user_summary['categories']) == 1 - entry = user_summary['categories'][0] - assert entry['category'] == cat - assert entry['successful_ops'] > 0 - - # TESTCASE 'usage-trim' 'usage' 'trim' 'user usage' 'succeeds, usage removed' - (ret, out) = rgwadmin_rest(admin_conn, ['usage', 'trim'], {'uid' : user1}) - assert ret == 200 - (ret, out) = rgwadmin_rest(admin_conn, ['usage', 'show'], {'uid' : user1}) - assert ret == 200 - assert len(out['entries']) == 0 - assert len(out['summary']) == 0 - - # TESTCASE 'user-suspend2','user','suspend','existing user','succeeds' - (ret, out) = rgwadmin_rest(admin_conn, ['user', 'modify'], {'uid' : user1, 'suspended' : True}) - assert ret == 200 - - # TESTCASE 'user-suspend3','user','suspend','suspended user','cannot write objects' - try: - key = boto.s3.key.Key(bucket) - key.set_contents_from_string('five') - except boto.exception.S3ResponseError as e: - assert e.status == 403 - - # TESTCASE 'user-renable2','user','enable','suspended user','succeeds' - (ret, out) = rgwadmin_rest(admin_conn, ['user', 'modify'], {'uid' : user1, 'suspended' : 'false'}) - assert ret == 200 - - # TESTCASE 
'user-renable3','user','enable','reenabled user','can write objects' - key = boto.s3.key.Key(bucket) - key.set_contents_from_string('six') - - # TESTCASE 'garbage-list', 'garbage', 'list', 'get list of objects ready for garbage collection' - - # create an object large enough to be split into multiple parts - test_string = 'foo'*10000000 - - big_key = boto.s3.key.Key(bucket) - big_key.set_contents_from_string(test_string) - - # now delete the head - big_key.delete() - - # TESTCASE 'rm-user-buckets','user','rm','existing user','fails, still has buckets' - (ret, out) = rgwadmin_rest(admin_conn, ['user', 'rm'], {'uid' : user1}) - assert ret == 409 - - # delete should fail because ``key`` still exists - try: - bucket.delete() - except boto.exception.S3ResponseError as e: - assert e.status == 409 - - key.delete() - bucket.delete() - - # TESTCASE 'policy', 'bucket', 'policy', 'get bucket policy', 'returns S3 policy' - bucket = connection.create_bucket(bucket_name) - - # create an object - key = boto.s3.key.Key(bucket) - key.set_contents_from_string('seven') - - # should be private already but guarantee it - key.set_acl('private') - - (ret, out) = rgwadmin_rest(admin_conn, ['policy', 'show'], {'bucket' : bucket.name, 'object' : key.key}) - assert ret == 200 - - acl = key.get_xml_acl() - assert acl == out.strip('\n') - - # add another grantee by making the object public read - key.set_acl('public-read') - - (ret, out) = rgwadmin_rest(admin_conn, ['policy', 'show'], {'bucket' : bucket.name, 'object' : key.key}) - assert ret == 200 - - acl = key.get_xml_acl() - assert acl == out.strip('\n') - - # TESTCASE 'rm-bucket', 'bucket', 'rm', 'bucket with objects', 'succeeds' - bucket = connection.create_bucket(bucket_name) - key_name = ['eight', 'nine', 'ten', 'eleven'] - for i in range(4): - key = boto.s3.key.Key(bucket) - key.set_contents_from_string(key_name[i]) - - (ret, out) = rgwadmin_rest(admin_conn, ['bucket', 'rm'], {'bucket' : bucket_name, 'purge-objects' : True}) - assert 
ret == 200 - - # TESTCASE 'caps-add', 'caps', 'add', 'add user cap', 'succeeds' - caps = 'usage=read' - (ret, out) = rgwadmin_rest(admin_conn, ['caps', 'add'], {'uid' : user1, 'user-caps' : caps}) - assert ret == 200 - assert out[0]['perm'] == 'read' - - # TESTCASE 'caps-rm', 'caps', 'rm', 'remove existing cap from user', 'succeeds' - (ret, out) = rgwadmin_rest(admin_conn, ['caps', 'rm'], {'uid' : user1, 'user-caps' : caps}) - assert ret == 200 - assert not out - - # TESTCASE 'rm-user','user','rm','existing user','fails, still has buckets' - bucket = connection.create_bucket(bucket_name) - key = boto.s3.key.Key(bucket) - - (ret, out) = rgwadmin_rest(admin_conn, ['user', 'rm'], {'uid' : user1}) - assert ret == 409 - - # TESTCASE 'rm-user2', 'user', 'rm', user with data', 'succeeds' - bucket = connection.create_bucket(bucket_name) - key = boto.s3.key.Key(bucket) - key.set_contents_from_string('twelve') - - (ret, out) = rgwadmin_rest(admin_conn, ['user', 'rm'], {'uid' : user1, 'purge-data' : True}) - assert ret == 200 - - # TESTCASE 'rm-user3','user','info','deleted user','fails' - (ret, out) = rgwadmin_rest(admin_conn, ['user', 'info'], {'uid' : user1}) - assert ret == 404 - diff --git a/teuthology/task/radosgw_agent.py b/teuthology/task/radosgw_agent.py deleted file mode 100644 index 2ebccae5cb..0000000000 --- a/teuthology/task/radosgw_agent.py +++ /dev/null @@ -1,211 +0,0 @@ -""" -Run rados gateway agent in test mode -""" -import contextlib -import logging -import argparse - -from ..orchestra import run -from teuthology import misc as teuthology -import teuthology.task_util.rgw as rgw_utils - -log = logging.getLogger(__name__) - -def run_radosgw_agent(ctx, config): - """ - Run a single radosgw-agent. See task() for config format. 
- """ - return_list = list() - for (client, cconf) in config.items(): - # don't process entries that are not clients - if not client.startswith('client.'): - log.debug('key {data} does not start with \'client.\', moving on'.format( - data=client)) - continue - - src_client = cconf['src'] - dest_client = cconf['dest'] - - src_zone = rgw_utils.zone_for_client(ctx, src_client) - dest_zone = rgw_utils.zone_for_client(ctx, dest_client) - - log.info("source is %s", src_zone) - log.info("dest is %s", dest_zone) - - testdir = teuthology.get_testdir(ctx) - (remote,) = ctx.cluster.only(client).remotes.keys() - # figure out which branch to pull from - branch = cconf.get('force-branch', None) - if not branch: - branch = cconf.get('branch', 'master') - sha1 = cconf.get('sha1') - remote.run( - args=[ - 'cd', testdir, run.Raw('&&'), - 'git', 'clone', - '-b', branch, -# 'https://github.com/ceph/radosgw-agent.git', - 'git://ceph.com/git/radosgw-agent.git', - 'radosgw-agent.{client}'.format(client=client), - ] - ) - if sha1 is not None: - remote.run( - args=[ - 'cd', testdir, run.Raw('&&'), - run.Raw('&&'), - 'git', 'reset', '--hard', sha1, - ] - ) - remote.run( - args=[ - 'cd', testdir, run.Raw('&&'), - 'cd', 'radosgw-agent.{client}'.format(client=client), - run.Raw('&&'), - './bootstrap', - ] - ) - - src_host, src_port = rgw_utils.get_zone_host_and_port(ctx, src_client, - src_zone) - dest_host, dest_port = rgw_utils.get_zone_host_and_port(ctx, dest_client, - dest_zone) - src_access, src_secret = rgw_utils.get_zone_system_keys(ctx, src_client, - src_zone) - dest_access, dest_secret = rgw_utils.get_zone_system_keys(ctx, dest_client, - dest_zone) - sync_scope = cconf.get('sync-scope', None) - port = cconf.get('port', 8000) - daemon_name = '{host}.{port}.syncdaemon'.format(host=remote.name, port=port) - in_args=[ - 'daemon-helper', - 'kill', - '{tdir}/radosgw-agent.{client}/radosgw-agent'.format(tdir=testdir, - client=client), - '-v', - '--src-access-key', src_access, - 
'--src-secret-key', src_secret, - '--source', "http://{addr}:{port}".format(addr=src_host, port=src_port), - '--dest-access-key', dest_access, - '--dest-secret-key', dest_secret, - '--max-entries', str(cconf.get('max-entries', 1000)), - '--log-file', '{tdir}/archive/rgw_sync_agent.{client}.log'.format( - tdir=testdir, - client=client), - '--object-sync-timeout', '30', - ] - - if cconf.get('metadata-only', False): - in_args.append('--metadata-only') - - # the test server and full/incremental flags are mutually exclusive - if sync_scope is None: - in_args.append('--test-server-host') - in_args.append('0.0.0.0') - in_args.append('--test-server-port') - in_args.append(str(port)) - log.debug('Starting a sync test server on {client}'.format(client=client)) - # Stash the radosgw-agent server / port # for use by subsequent tasks - ctx.radosgw_agent.endpoint = (client, str(port)) - else: - in_args.append('--sync-scope') - in_args.append(sync_scope) - log.debug('Starting a {scope} sync on {client}'.format(scope=sync_scope,client=client)) - - # positional arg for destination must come last - in_args.append("http://{addr}:{port}".format(addr=dest_host, - port=dest_port)) - - return_list.append((client, remote.run( - args=in_args, - wait=False, - stdin=run.PIPE, - logger=log.getChild(daemon_name), - ))) - return return_list - - -@contextlib.contextmanager -def task(ctx, config): - """ - Run radosgw-agents in test mode. - - Configuration is clients to run the agents on, with settings for - source client, destination client, and port to listen on. Binds - to 0.0.0.0. Port defaults to 8000. This must be run on clients - that have the correct zone root pools and rgw zone set in - ceph.conf, or the task cannot read the region information from the - cluster. - - By default, this task will start an HTTP server that will trigger full - or incremental syncs based on requests made to it. 
- Alternatively, a single full sync can be triggered by - specifying 'sync-scope: full' or a loop of incremental syncs can be triggered - by specifying 'sync-scope: incremental' (the loop will sleep - '--incremental-sync-delay' seconds between each sync, default is 30 seconds). - - By default, both data and metadata are synced. To only sync - metadata, for example because you want to sync between regions, - set metadata-only: true. - - An example:: - - tasks: - - ceph: - conf: - client.0: - rgw zone = foo - rgw zone root pool = .root.pool - client.1: - rgw zone = bar - rgw zone root pool = .root.pool2 - - rgw: # region configuration omitted for brevity - - radosgw-agent: - client.0: - branch: wip-next-feature-branch - src: client.0 - dest: client.1 - sync-scope: full - metadata-only: true - # port: 8000 (default) - client.1: - src: client.1 - dest: client.0 - port: 8001 - """ - assert isinstance(config, dict), 'rgw_sync_agent requires a dictionary config' - log.debug("config is %s", config) - - overrides = ctx.config.get('overrides', {}) - # merge each client section, but only if it exists in config since there isn't - # a sensible default action for this task - for client in config.iterkeys(): - if config[client]: - log.debug('config[{client}]: {data}'.format(client=client, data=config[client])) - teuthology.deep_merge(config[client], overrides.get('radosgw-agent', {})) - - ctx.radosgw_agent = argparse.Namespace() - ctx.radosgw_agent.config = config - - procs = run_radosgw_agent(ctx, config) - - ctx.radosgw_agent.procs = procs - - try: - yield - finally: - testdir = teuthology.get_testdir(ctx) - try: - for client, proc in procs: - log.info("shutting down sync agent on %s", client) - proc.stdin.close() - proc.wait() - finally: - for client, proc in procs: - ctx.cluster.only(client).run( - args=[ - 'rm', '-rf', - '{tdir}/radosgw-agent.{client}'.format(tdir=testdir, - client=client) - ] - ) diff --git a/teuthology/task/rbd.py b/teuthology/task/rbd.py deleted file 
mode 100644 index fb93a0599d..0000000000 --- a/teuthology/task/rbd.py +++ /dev/null @@ -1,512 +0,0 @@ -""" -Rbd testing task -""" -import contextlib -import logging -import os - -from cStringIO import StringIO -from ..orchestra import run -from teuthology import misc as teuthology -from teuthology import contextutil -from teuthology.parallel import parallel -from teuthology.task.common_fs_utils import generic_mkfs -from teuthology.task.common_fs_utils import generic_mount -from teuthology.task.common_fs_utils import default_image_name - -log = logging.getLogger(__name__) - -@contextlib.contextmanager -def create_image(ctx, config): - """ - Create an rbd image. - - For example:: - - tasks: - - ceph: - - rbd.create_image: - client.0: - image_name: testimage - image_size: 100 - image_format: 1 - client.1: - - Image size is expressed as a number of megabytes; default value - is 10240. - - Image format value must be either 1 or 2; default value is 1. - - """ - assert isinstance(config, dict) or isinstance(config, list), \ - "task create_image only supports a list or dictionary for configuration" - - if isinstance(config, dict): - images = config.items() - else: - images = [(role, None) for role in config] - - testdir = teuthology.get_testdir(ctx) - for role, properties in images: - if properties is None: - properties = {} - name = properties.get('image_name', default_image_name(role)) - size = properties.get('image_size', 10240) - fmt = properties.get('image_format', 1) - (remote,) = ctx.cluster.only(role).remotes.keys() - log.info('Creating image {name} with size {size}'.format(name=name, - size=size)) - args = [ - 'adjust-ulimits', - 'ceph-coverage'.format(tdir=testdir), - '{tdir}/archive/coverage'.format(tdir=testdir), - 'rbd', - '-p', 'rbd', - 'create', - '--size', str(size), - name, - ] - # omit format option if using the default (format 1) - # since old versions of don't support it - if int(fmt) != 1: - args += ['--format', str(fmt)] - remote.run(args=args) - try: 
- yield - finally: - log.info('Deleting rbd images...') - for role, properties in images: - if properties is None: - properties = {} - name = properties.get('image_name', default_image_name(role)) - (remote,) = ctx.cluster.only(role).remotes.keys() - remote.run( - args=[ - 'adjust-ulimits', - 'ceph-coverage', - '{tdir}/archive/coverage'.format(tdir=testdir), - 'rbd', - '-p', 'rbd', - 'rm', - name, - ], - ) - -@contextlib.contextmanager -def modprobe(ctx, config): - """ - Load the rbd kernel module.. - - For example:: - - tasks: - - ceph: - - rbd.create_image: [client.0] - - rbd.modprobe: [client.0] - """ - log.info('Loading rbd kernel module...') - for role in config: - (remote,) = ctx.cluster.only(role).remotes.keys() - remote.run( - args=[ - 'sudo', - 'modprobe', - 'rbd', - ], - ) - try: - yield - finally: - log.info('Unloading rbd kernel module...') - for role in config: - (remote,) = ctx.cluster.only(role).remotes.keys() - remote.run( - args=[ - 'sudo', - 'modprobe', - '-r', - 'rbd', - # force errors to be ignored; necessary if more - # than one device was created, which may mean - # the module isn't quite ready to go the first - # time through. - run.Raw('||'), - 'true', - ], - ) - -@contextlib.contextmanager -def dev_create(ctx, config): - """ - Map block devices to rbd images. 
- - For example:: - - tasks: - - ceph: - - rbd.create_image: [client.0] - - rbd.modprobe: [client.0] - - rbd.dev_create: - client.0: testimage.client.0 - """ - assert isinstance(config, dict) or isinstance(config, list), \ - "task dev_create only supports a list or dictionary for configuration" - - if isinstance(config, dict): - role_images = config.items() - else: - role_images = [(role, None) for role in config] - - log.info('Creating rbd block devices...') - - testdir = teuthology.get_testdir(ctx) - - for role, image in role_images: - if image is None: - image = default_image_name(role) - (remote,) = ctx.cluster.only(role).remotes.keys() - - remote.run( - args=[ - 'sudo', - 'adjust-ulimits', - 'ceph-coverage', - '{tdir}/archive/coverage'.format(tdir=testdir), - 'rbd', - '--user', role.rsplit('.')[-1], - '-p', 'rbd', - 'map', - image, - run.Raw('&&'), - # wait for the symlink to be created by udev - 'while', 'test', '!', '-e', '/dev/rbd/rbd/{image}'.format(image=image), run.Raw(';'), 'do', - 'sleep', '1', run.Raw(';'), - 'done', - ], - ) - try: - yield - finally: - log.info('Unmapping rbd devices...') - for role, image in role_images: - if image is None: - image = default_image_name(role) - (remote,) = ctx.cluster.only(role).remotes.keys() - remote.run( - args=[ - 'LD_LIBRARY_PATH={tdir}/binary/usr/local/lib'.format(tdir=testdir), - 'sudo', - 'adjust-ulimits', - 'ceph-coverage', - '{tdir}/archive/coverage'.format(tdir=testdir), - 'rbd', - '-p', 'rbd', - 'unmap', - '/dev/rbd/rbd/{imgname}'.format(imgname=image), - run.Raw('&&'), - # wait for the symlink to be deleted by udev - 'while', 'test', '-e', '/dev/rbd/rbd/{image}'.format(image=image), - run.Raw(';'), - 'do', - 'sleep', '1', run.Raw(';'), - 'done', - ], - ) - - -def rbd_devname_rtn(ctx, image): - return '/dev/rbd/rbd/{image}'.format(image=image) - -def canonical_path(ctx, role, path): - """ - Determine the canonical path for a given path on the host - representing the given role. 
A canonical path contains no - . or .. components, and includes no symbolic links. - """ - version_fp = StringIO() - ctx.cluster.only(role).run( - args=[ 'readlink', '-f', path ], - stdout=version_fp, - ) - canonical_path = version_fp.getvalue().rstrip('\n') - version_fp.close() - return canonical_path - -@contextlib.contextmanager -def run_xfstests(ctx, config): - """ - Run xfstests over specified devices. - - Warning: both the test and scratch devices specified will be - overwritten. Normally xfstests modifies (but does not destroy) - the test device, but for now the run script used here re-makes - both filesystems. - - Note: Only one instance of xfstests can run on a single host at - a time, although this is not enforced. - - This task in its current form needs some improvement. For - example, it assumes all roles provided in the config are - clients, and that the config provided is a list of key/value - pairs. For now please use the xfstests() interface, below. - - For example:: - - tasks: - - ceph: - - rbd.run_xfstests: - client.0: - count: 2 - test_dev: 'test_dev' - scratch_dev: 'scratch_dev' - fs_type: 'xfs' - tests: 'generic/100 xfs/003 xfs/005 xfs/006 generic/015' - randomize: true - """ - with parallel() as p: - for role, properties in config.items(): - p.spawn(run_xfstests_one_client, ctx, role, properties) - yield - -def run_xfstests_one_client(ctx, role, properties): - """ - Spawned routine to handle xfs tests for a single client - """ - testdir = teuthology.get_testdir(ctx) - try: - count = properties.get('count') - test_dev = properties.get('test_dev') - assert test_dev is not None, \ - "task run_xfstests requires test_dev to be defined" - test_dev = canonical_path(ctx, role, test_dev) - - scratch_dev = properties.get('scratch_dev') - assert scratch_dev is not None, \ - "task run_xfstests requires scratch_dev to be defined" - scratch_dev = canonical_path(ctx, role, scratch_dev) - - fs_type = properties.get('fs_type') - tests = properties.get('tests') 
- randomize = properties.get('randomize') - - (remote,) = ctx.cluster.only(role).remotes.keys() - - # Fetch the test script - test_root = teuthology.get_testdir(ctx) - test_script = 'run_xfstests_krbd.sh' - test_path = os.path.join(test_root, test_script) - - git_branch = 'master' - test_url = 'https://raw.github.com/ceph/ceph/{branch}/qa/{script}'.format(branch=git_branch, script=test_script) - - log.info('Fetching {script} for {role} from {url}'.format(script=test_script, - role=role, - url=test_url)) - args = [ 'wget', '-O', test_path, '--', test_url ] - remote.run(args=args) - - log.info('Running xfstests on {role}:'.format(role=role)) - log.info(' iteration count: {count}:'.format(count=count)) - log.info(' test device: {dev}'.format(dev=test_dev)) - log.info(' scratch device: {dev}'.format(dev=scratch_dev)) - log.info(' using fs_type: {fs_type}'.format(fs_type=fs_type)) - log.info(' tests to run: {tests}'.format(tests=tests)) - log.info(' randomize: {randomize}'.format(randomize=randomize)) - - # Note that the device paths are interpreted using - # readlink -f in order to get their canonical - # pathname (so it matches what the kernel remembers). - args = [ - '/usr/bin/sudo', - 'TESTDIR={tdir}'.format(tdir=testdir), - 'adjust-ulimits', - 'ceph-coverage', - '{tdir}/archive/coverage'.format(tdir=testdir), - '/bin/bash', - test_path, - '-c', str(count), - '-f', fs_type, - '-t', test_dev, - '-s', scratch_dev, - ] - if randomize: - args.append('-r') - if tests: - args.extend(['--', tests]) - remote.run(args=args, logger=log.getChild(role)) - finally: - log.info('Removing {script} on {role}'.format(script=test_script, - role=role)) - remote.run(args=['rm', '-f', test_path]) - -@contextlib.contextmanager -def xfstests(ctx, config): - """ - Run xfstests over rbd devices. This interface sets up all - required configuration automatically if not otherwise specified. - Note that only one instance of xfstests can run on a single host - at a time. 
By default, the set of tests specified is run once. - If a (non-zero) count value is supplied, the complete set of - tests will be run that number of times. - - For example:: - - tasks: - - ceph: - # Image sizes are in MB - - rbd.xfstests: - client.0: - count: 3 - test_image: 'test_image' - test_size: 250 - test_format: 2 - scratch_image: 'scratch_image' - scratch_size: 250 - scratch_format: 1 - fs_type: 'xfs' - tests: 'generic/100 xfs/003 xfs/005 xfs/006 generic/015' - randomize: true - """ - if config is None: - config = { 'all': None } - assert isinstance(config, dict) or isinstance(config, list), \ - "task xfstests only supports a list or dictionary for configuration" - if isinstance(config, dict): - config = teuthology.replace_all_with_clients(ctx.cluster, config) - runs = config.items() - else: - runs = [(role, None) for role in config] - - running_xfstests = {} - for role, properties in runs: - assert role.startswith('client.'), \ - "task xfstests can only run on client nodes" - for host, roles_for_host in ctx.cluster.remotes.items(): - if role in roles_for_host: - assert host not in running_xfstests, \ - "task xfstests allows only one instance at a time per host" - running_xfstests[host] = True - - images_config = {} - scratch_config = {} - modprobe_config = {} - image_map_config = {} - scratch_map_config = {} - xfstests_config = {} - for role, properties in runs: - if properties is None: - properties = {} - - test_image = properties.get('test_image', 'test_image.{role}'.format(role=role)) - test_size = properties.get('test_size', 2000) # 2G - test_fmt = properties.get('test_format', 1) - scratch_image = properties.get('scratch_image', 'scratch_image.{role}'.format(role=role)) - scratch_size = properties.get('scratch_size', 10000) # 10G - scratch_fmt = properties.get('scratch_format', 1) - - images_config[role] = dict( - image_name=test_image, - image_size=test_size, - image_format=test_fmt, - ) - - scratch_config[role] = dict( - image_name=scratch_image, - 
image_size=scratch_size, - image_format=scratch_fmt, - ) - - xfstests_config[role] = dict( - count=properties.get('count', 1), - test_dev='/dev/rbd/rbd/{image}'.format(image=test_image), - scratch_dev='/dev/rbd/rbd/{image}'.format(image=scratch_image), - fs_type=properties.get('fs_type', 'xfs'), - randomize=properties.get('randomize', False), - tests=properties.get('tests'), - ) - - log.info('Setting up xfstests using RBD images:') - log.info(' test ({size} MB): {image}'.format(size=test_size, - image=test_image)) - log.info(' scratch ({size} MB): {image}'.format(size=scratch_size, - image=scratch_image)) - modprobe_config[role] = None - image_map_config[role] = test_image - scratch_map_config[role] = scratch_image - - with contextutil.nested( - lambda: create_image(ctx=ctx, config=images_config), - lambda: create_image(ctx=ctx, config=scratch_config), - lambda: modprobe(ctx=ctx, config=modprobe_config), - lambda: dev_create(ctx=ctx, config=image_map_config), - lambda: dev_create(ctx=ctx, config=scratch_map_config), - lambda: run_xfstests(ctx=ctx, config=xfstests_config), - ): - yield - - -@contextlib.contextmanager -def task(ctx, config): - """ - Create and mount an rbd image. 
- - For example, you can specify which clients to run on:: - - tasks: - - ceph: - - rbd: [client.0, client.1] - - There are a few image options:: - - tasks: - - ceph: - - rbd: - client.0: # uses defaults - client.1: - image_name: foo - image_size: 2048 - image_format: 2 - fs_type: xfs - - To use default options on all clients:: - - tasks: - - ceph: - - rbd: - all: - - To create 20GiB images and format them with xfs on all clients:: - - tasks: - - ceph: - - rbd: - all: - image_size: 20480 - fs_type: xfs - """ - if config is None: - config = { 'all': None } - norm_config = config - if isinstance(config, dict): - norm_config = teuthology.replace_all_with_clients(ctx.cluster, config) - if isinstance(norm_config, dict): - role_images = {} - for role, properties in norm_config.iteritems(): - if properties is None: - properties = {} - role_images[role] = properties.get('image_name') - else: - role_images = norm_config - - log.debug('rbd config is: %s', norm_config) - - with contextutil.nested( - lambda: create_image(ctx=ctx, config=norm_config), - lambda: modprobe(ctx=ctx, config=norm_config), - lambda: dev_create(ctx=ctx, config=role_images), - lambda: generic_mkfs(ctx=ctx, config=norm_config, - devname_rtn=rbd_devname_rtn), - lambda: generic_mount(ctx=ctx, config=role_images, - devname_rtn=rbd_devname_rtn), - ): - yield diff --git a/teuthology/task/rbd_fsx.py b/teuthology/task/rbd_fsx.py deleted file mode 100644 index d848a88c56..0000000000 --- a/teuthology/task/rbd_fsx.py +++ /dev/null @@ -1,82 +0,0 @@ -""" -Run fsx on an rbd image -""" -import contextlib -import logging - -from teuthology.parallel import parallel -from teuthology import misc as teuthology - -log = logging.getLogger(__name__) - -@contextlib.contextmanager -def task(ctx, config): - """ - Run fsx on an rbd image. - - Currently this requires running as client.admin - to create a pool. 
- - Specify which clients to run on as a list:: - - tasks: - ceph: - rbd_fsx: - clients: [client.0, client.1] - - You can optionally change some properties of fsx: - - tasks: - ceph: - rbd_fsx: - clients: - seed: - ops: - size: - """ - log.info('starting rbd_fsx...') - with parallel() as p: - for role in config['clients']: - p.spawn(_run_one_client, ctx, config, role) - yield - -def _run_one_client(ctx, config, role): - """Spawned task that runs the client""" - krbd = config.get('krbd', False) - testdir = teuthology.get_testdir(ctx) - (remote,) = ctx.cluster.only(role).remotes.iterkeys() - - args = [] - if krbd: - args.append('sudo') # rbd map/unmap need privileges - args.extend([ - 'adjust-ulimits', - 'ceph-coverage', - '{tdir}/archive/coverage'.format(tdir=testdir), - 'ceph_test_librbd_fsx', - '-d', # debug output for all operations - '-W', '-R', # mmap doesn't work with rbd - '-p', str(config.get('progress_interval', 100)), # show progress - '-P', '{tdir}/archive'.format(tdir=testdir), - '-r', str(config.get('readbdy',1)), - '-w', str(config.get('writebdy',1)), - '-t', str(config.get('truncbdy',1)), - '-h', str(config.get('holebdy',1)), - '-l', str(config.get('size', 250000000)), - '-S', str(config.get('seed', 0)), - '-N', str(config.get('ops', 1000)), - ]) - if krbd: - args.append('-K') # -K enables krbd mode - if config.get('direct_io', False): - args.append('-Z') # -Z use direct IO - if not config.get('randomized_striping', True): - args.append('-U') # -U disables randomized striping - if not config.get('punch_holes', True): - args.append('-H') # -H disables discard ops - args.extend([ - 'pool_{pool}'.format(pool=role), - 'image_{image}'.format(image=role), - ]) - - remote.run(args=args) diff --git a/teuthology/task/recovery_bench.py b/teuthology/task/recovery_bench.py deleted file mode 100644 index 1984b97d31..0000000000 --- a/teuthology/task/recovery_bench.py +++ /dev/null @@ -1,208 +0,0 @@ -""" -Recovery system benchmarking -""" -from cStringIO import 
StringIO - -import contextlib -import gevent -import json -import logging -import random -import time - -import ceph_manager -from teuthology import misc as teuthology - -log = logging.getLogger(__name__) - -@contextlib.contextmanager -def task(ctx, config): - """ - Benchmark the recovery system. - - Generates objects with smalliobench, runs it normally to get a - baseline performance measurement, then marks an OSD out and reruns - to measure performance during recovery. - - The config should be as follows: - - recovery_bench: - duration: - num_objects: - io_size: - - example: - - tasks: - - ceph: - - recovery_bench: - duration: 60 - num_objects: 500 - io_size: 4096 - """ - if config is None: - config = {} - assert isinstance(config, dict), \ - 'recovery_bench task only accepts a dict for configuration' - - log.info('Beginning recovery bench...') - - first_mon = teuthology.get_first_mon(ctx, config) - (mon,) = ctx.cluster.only(first_mon).remotes.iterkeys() - - manager = ceph_manager.CephManager( - mon, - ctx=ctx, - logger=log.getChild('ceph_manager'), - ) - - num_osds = teuthology.num_instances_of_type(ctx.cluster, 'osd') - while len(manager.get_osd_status()['up']) < num_osds: - manager.sleep(10) - - bench_proc = RecoveryBencher( - manager, - config, - ) - try: - yield - finally: - log.info('joining recovery bencher') - bench_proc.do_join() - -class RecoveryBencher: - """ - RecoveryBencher - """ - def __init__(self, manager, config): - self.ceph_manager = manager - self.ceph_manager.wait_for_clean() - - osd_status = self.ceph_manager.get_osd_status() - self.osds = osd_status['up'] - - self.config = config - if self.config is None: - self.config = dict() - - else: - def tmp(x): - """ - Local wrapper to print value. - """ - print x - self.log = tmp - - log.info("spawning thread") - - self.thread = gevent.spawn(self.do_bench) - - def do_join(self): - """ - Join the recovery bencher. This is called after the main - task exits. 
- """ - self.thread.get() - - def do_bench(self): - """ - Do the benchmarking. - """ - duration = self.config.get("duration", 60) - num_objects = self.config.get("num_objects", 500) - io_size = self.config.get("io_size", 4096) - - osd = str(random.choice(self.osds)) - (osd_remote,) = self.ceph_manager.ctx.cluster.only('osd.%s' % osd).remotes.iterkeys() - - testdir = teuthology.get_testdir(self.ceph_manager.ctx) - - # create the objects - osd_remote.run( - args=[ - 'adjust-ulimits', - 'ceph-coverage', - '{tdir}/archive/coverage'.format(tdir=testdir), - 'smalliobench'.format(tdir=testdir), - '--use-prefix', 'recovery_bench', - '--init-only', '1', - '--num-objects', str(num_objects), - '--io-size', str(io_size), - ], - wait=True, - ) - - # baseline bench - log.info('non-recovery (baseline)') - p = osd_remote.run( - args=[ - 'adjust-ulimits', - 'ceph-coverage', - '{tdir}/archive/coverage'.format(tdir=testdir), - 'smalliobench', - '--use-prefix', 'recovery_bench', - '--do-not-init', '1', - '--duration', str(duration), - '--io-size', str(io_size), - ], - stdout=StringIO(), - stderr=StringIO(), - wait=True, - ) - self.process_samples(p.stderr.getvalue()) - - self.ceph_manager.raw_cluster_cmd('osd', 'out', osd) - time.sleep(5) - - # recovery bench - log.info('recovery active') - p = osd_remote.run( - args=[ - 'adjust-ulimits', - 'ceph-coverage', - '{tdir}/archive/coverage'.format(tdir=testdir), - 'smalliobench', - '--use-prefix', 'recovery_bench', - '--do-not-init', '1', - '--duration', str(duration), - '--io-size', str(io_size), - ], - stdout=StringIO(), - stderr=StringIO(), - wait=True, - ) - self.process_samples(p.stderr.getvalue()) - - self.ceph_manager.raw_cluster_cmd('osd', 'in', osd) - - def process_samples(self, input): - """ - Extract samples from the input and process the results - - :param input: input lines in JSON format - """ - lat = {} - for line in input.split('\n'): - try: - sample = json.loads(line) - samples = lat.setdefault(sample['type'], []) - 
samples.append(float(sample['latency'])) - except Exception: - pass - - for type in lat: - samples = lat[type] - samples.sort() - - num = len(samples) - - # median - if num & 1 == 1: # odd number of samples - median = samples[num / 2] - else: - median = (samples[num / 2] + samples[num / 2 - 1]) / 2 - - # 99% - ninety_nine = samples[int(num * 0.99)] - - log.info("%s: median %f, 99%% %f" % (type, median, ninety_nine)) diff --git a/teuthology/task/rep_lost_unfound_delete.py b/teuthology/task/rep_lost_unfound_delete.py deleted file mode 100644 index cac8d4f198..0000000000 --- a/teuthology/task/rep_lost_unfound_delete.py +++ /dev/null @@ -1,156 +0,0 @@ -""" -Lost_unfound -""" -import logging -import ceph_manager -from teuthology import misc as teuthology -from teuthology.task_util.rados import rados - -log = logging.getLogger(__name__) - -def task(ctx, config): - """ - Test handling of lost objects. - - A pretty rigid cluseter is brought up andtested by this task - """ - POOL = 'unfounddel_pool' - if config is None: - config = {} - assert isinstance(config, dict), \ - 'lost_unfound task only accepts a dict for configuration' - first_mon = teuthology.get_first_mon(ctx, config) - (mon,) = ctx.cluster.only(first_mon).remotes.iterkeys() - - manager = ceph_manager.CephManager( - mon, - ctx=ctx, - logger=log.getChild('ceph_manager'), - ) - - while len(manager.get_osd_status()['up']) < 3: - manager.sleep(10) - manager.raw_cluster_cmd('tell', 'osd.0', 'flush_pg_stats') - manager.raw_cluster_cmd('tell', 'osd.1', 'flush_pg_stats') - manager.raw_cluster_cmd('tell', 'osd.2', 'flush_pg_stats') - manager.wait_for_clean() - - manager.create_pool(POOL) - - # something that is always there - dummyfile = '/etc/fstab' - - # take an osd out until the very end - manager.kill_osd(2) - manager.mark_down_osd(2) - manager.mark_out_osd(2) - - # kludge to make sure they get a map - rados(ctx, mon, ['-p', POOL, 'put', 'dummy', dummyfile]) - - manager.raw_cluster_cmd('tell', 'osd.0', 
'flush_pg_stats') - manager.raw_cluster_cmd('tell', 'osd.1', 'flush_pg_stats') - manager.wait_for_recovery() - - # create old objects - for f in range(1, 10): - rados(ctx, mon, ['-p', POOL, 'put', 'existing_%d' % f, dummyfile]) - rados(ctx, mon, ['-p', POOL, 'put', 'existed_%d' % f, dummyfile]) - rados(ctx, mon, ['-p', POOL, 'rm', 'existed_%d' % f]) - - # delay recovery, and make the pg log very long (to prevent backfill) - manager.raw_cluster_cmd( - 'tell', 'osd.1', - 'injectargs', - '--osd-recovery-delay-start 1000 --osd-min-pg-log-entries 100000000' - ) - - manager.kill_osd(0) - manager.mark_down_osd(0) - - for f in range(1, 10): - rados(ctx, mon, ['-p', POOL, 'put', 'new_%d' % f, dummyfile]) - rados(ctx, mon, ['-p', POOL, 'put', 'existed_%d' % f, dummyfile]) - rados(ctx, mon, ['-p', POOL, 'put', 'existing_%d' % f, dummyfile]) - - # bring osd.0 back up, let it peer, but don't replicate the new - # objects... - log.info('osd.0 command_args is %s' % 'foo') - log.info(ctx.daemons.get_daemon('osd', 0).command_args) - ctx.daemons.get_daemon('osd', 0).command_kwargs['args'].extend([ - '--osd-recovery-delay-start', '1000' - ]) - manager.revive_osd(0) - manager.mark_in_osd(0) - manager.wait_till_osd_is_up(0) - - manager.raw_cluster_cmd('tell', 'osd.1', 'flush_pg_stats') - manager.raw_cluster_cmd('tell', 'osd.0', 'flush_pg_stats') - manager.wait_till_active() - - # take out osd.1 and the only copy of those objects. 
- manager.kill_osd(1) - manager.mark_down_osd(1) - manager.mark_out_osd(1) - manager.raw_cluster_cmd('osd', 'lost', '1', '--yes-i-really-mean-it') - - # bring up osd.2 so that things would otherwise, in theory, recovery fully - manager.revive_osd(2) - manager.mark_in_osd(2) - manager.wait_till_osd_is_up(2) - - manager.raw_cluster_cmd('tell', 'osd.0', 'flush_pg_stats') - manager.raw_cluster_cmd('tell', 'osd.2', 'flush_pg_stats') - manager.wait_till_active() - manager.raw_cluster_cmd('tell', 'osd.0', 'flush_pg_stats') - manager.raw_cluster_cmd('tell', 'osd.2', 'flush_pg_stats') - - # verify that there are unfound objects - unfound = manager.get_num_unfound_objects() - log.info("there are %d unfound objects" % unfound) - assert unfound - - # mark stuff lost - pgs = manager.get_pg_stats() - for pg in pgs: - if pg['stat_sum']['num_objects_unfound'] > 0: - primary = 'osd.%d' % pg['acting'][0] - - # verify that i can list them direct from the osd - log.info('listing missing/lost in %s state %s', pg['pgid'], - pg['state']); - m = manager.list_pg_missing(pg['pgid']) - #log.info('%s' % m) - assert m['num_unfound'] == pg['stat_sum']['num_objects_unfound'] - num_unfound=0 - for o in m['objects']: - if len(o['locations']) == 0: - num_unfound += 1 - assert m['num_unfound'] == num_unfound - - log.info("reverting unfound in %s on %s", pg['pgid'], primary) - manager.raw_cluster_cmd('pg', pg['pgid'], - 'mark_unfound_lost', 'delete') - else: - log.info("no unfound in %s", pg['pgid']) - - manager.raw_cluster_cmd('tell', 'osd.0', 'debug', 'kick_recovery_wq', '5') - manager.raw_cluster_cmd('tell', 'osd.2', 'debug', 'kick_recovery_wq', '5') - manager.raw_cluster_cmd('tell', 'osd.0', 'flush_pg_stats') - manager.raw_cluster_cmd('tell', 'osd.2', 'flush_pg_stats') - manager.wait_for_recovery() - - # verify result - for f in range(1, 10): - err = rados(ctx, mon, ['-p', POOL, 'get', 'new_%d' % f, '-']) - assert err - err = rados(ctx, mon, ['-p', POOL, 'get', 'existed_%d' % f, '-']) - assert 
err - err = rados(ctx, mon, ['-p', POOL, 'get', 'existing_%d' % f, '-']) - assert err - - # see if osd.1 can cope - manager.revive_osd(1) - manager.mark_in_osd(1) - manager.wait_till_osd_is_up(1) - manager.wait_for_clean() diff --git a/teuthology/task/repair_test.py b/teuthology/task/repair_test.py deleted file mode 100644 index 340163f8a4..0000000000 --- a/teuthology/task/repair_test.py +++ /dev/null @@ -1,297 +0,0 @@ -""" -Test pool repairing after objects are damaged. -""" -import logging -import time - -import ceph_manager -from teuthology import misc as teuthology - -log = logging.getLogger(__name__) - -def setup(ctx, config): - """ - Create the repair test pool. - """ - ctx.manager.wait_for_clean() - ctx.manager.create_pool("repair_test_pool", 1) - return "repair_test_pool" - -def teardown(ctx, config, pool): - """ - Remove the repair test pool. - """ - ctx.manager.remove_pool(pool) - ctx.manager.wait_for_clean() - -def run_test(ctx, config, test): - """ - Setup a test pool, run the test, and clean up afterwards. - - :param test: function passed in, called to run the test. - """ - s = setup(ctx, config) - test(ctx, config, s) - teardown(ctx, config, s) - -def choose_primary(ctx): - """ - Select a primary for the next test. This routine is typically passed to - as a 'chooser function' - """ - def ret(pool, num): - """ - Return primary to test on. - """ - log.info("Choosing primary") - return ctx.manager.get_pg_primary(pool, num) - return ret - -def choose_replica(ctx): - """ - Select a replica for the next test. This routine is typically passed to - as a 'chooser function' - """ - def ret(pool, num): - """ - Return replica to test on. - """ - log.info("Choosing replica") - return ctx.manager.get_pg_replica(pool, num) - return ret - -def trunc(ctx): - """ - Truncate an object in the pool. 
This function is typically passed as a - 'corrupter function' - """ - def ret(osd, pool, obj): - """ - truncate an object - """ - log.info("truncating object") - return ctx.manager.osd_admin_socket( - osd, - ['truncobj', pool, obj, '1']) - return ret - -def dataerr(ctx): - """ - Generate an error on an object in the pool. This function is typically - passed as a 'corrupter function' - """ - def ret(osd, pool, obj): - """ - cause an error in the data - """ - log.info("injecting data err on object") - return ctx.manager.osd_admin_socket( - osd, - ['injectdataerr', pool, obj]) - return ret - -def mdataerr(ctx): - """ - Generate an mdata error on an object in the pool. This function is - typically passed as a 'corrupter function' - """ - def ret(osd, pool, obj): - """ - cause an error in the mdata - """ - log.info("injecting mdata err on object") - return ctx.manager.osd_admin_socket( - osd, - ['injectmdataerr', pool, obj]) - return ret - -def omaperr(ctx): - """ - Cause data corruption by injecting omap errors into a pool. - """ - def ret(osd, pool, obj): - """ - Cause an omap error. - """ - log.info("injecting omap err on object") - return ctx.manager.osd_admin_socket(osd, ['setomapval', pool, obj, 'badkey', 'badval']); - return ret - -def gen_repair_test_1(corrupter, chooser, scrub_type): - """ - Repair test. Wrapper for the internal ret function. - - The internal ret function creates an object in the pool, corrupts it, - scrubs it, and verifies that the pool is inconsistent. It then repairs - the pool, rescrubs it, and verifies that the pool is consistent - - :param corrupter: error generating function (truncate, data-error, or - meta-data error, for example). 
- :param chooser: osd type chooser (primary or replica) - :param scrub_type: regular scrub or deep-scrub - """ - def ret(ctx, config, pool): - """ - :param pool: repair test pool - """ - log.info("starting repair test type 1") - victim_osd = chooser(pool, 0) - - # create object - log.info("doing put") - ctx.manager.do_put(pool, 'repair_test_obj', '/etc/hosts') - - # corrupt object - log.info("corrupting object") - corrupter(victim_osd, pool, 'repair_test_obj') - - # verify inconsistent - log.info("scrubbing") - ctx.manager.do_pg_scrub(pool, 0, scrub_type) - - assert ctx.manager.pg_inconsistent(pool, 0) - - # repair - log.info("repairing") - ctx.manager.do_pg_scrub(pool, 0, "repair") - - log.info("re-scrubbing") - ctx.manager.do_pg_scrub(pool, 0, scrub_type) - - # verify consistent - assert not ctx.manager.pg_inconsistent(pool, 0) - log.info("done") - return ret - -def gen_repair_test_2(chooser): - """ - Repair test. Wrapper for the internal ret function. - - The internal ret function first creates a set of objects and - sets the omap value. It then corrupts an object, does both a scrub - and a deep-scrub, and then corrupts more objects. After that, it - repairs the pool and makes sure that the pool is consistent some - time after a deep-scrub. - - :param chooser: primary or replica selection routine. - """ - def ret(ctx, config, pool): - """ - :param pool: repair test pool. 
- """ - log.info("starting repair test type 2") - victim_osd = chooser(pool, 0) - first_mon = teuthology.get_first_mon(ctx, config) - (mon,) = ctx.cluster.only(first_mon).remotes.iterkeys() - - # create object - log.info("doing put and setomapval") - ctx.manager.do_put(pool, 'file1', '/etc/hosts') - ctx.manager.do_rados(mon, ['-p', pool, 'setomapval', 'file1', 'key', 'val']) - ctx.manager.do_put(pool, 'file2', '/etc/hosts') - ctx.manager.do_put(pool, 'file3', '/etc/hosts') - ctx.manager.do_put(pool, 'file4', '/etc/hosts') - ctx.manager.do_put(pool, 'file5', '/etc/hosts') - ctx.manager.do_rados(mon, ['-p', pool, 'setomapval', 'file5', 'key', 'val']) - ctx.manager.do_put(pool, 'file6', '/etc/hosts') - - # corrupt object - log.info("corrupting object") - omaperr(ctx)(victim_osd, pool, 'file1') - - # verify inconsistent - log.info("scrubbing") - ctx.manager.do_pg_scrub(pool, 0, 'deep-scrub') - - assert ctx.manager.pg_inconsistent(pool, 0) - - # Regression test for bug #4778, should still - # be inconsistent after scrub - ctx.manager.do_pg_scrub(pool, 0, 'scrub') - - assert ctx.manager.pg_inconsistent(pool, 0) - - # Additional corruptions including 2 types for file1 - log.info("corrupting more objects") - dataerr(ctx)(victim_osd, pool, 'file1') - mdataerr(ctx)(victim_osd, pool, 'file2') - trunc(ctx)(victim_osd, pool, 'file3') - omaperr(ctx)(victim_osd, pool, 'file6') - - # see still inconsistent - log.info("scrubbing") - ctx.manager.do_pg_scrub(pool, 0, 'deep-scrub') - - assert ctx.manager.pg_inconsistent(pool, 0) - - # repair - log.info("repairing") - ctx.manager.do_pg_scrub(pool, 0, "repair") - - # Let repair clear inconsistent flag - time.sleep(10) - - # verify consistent - assert not ctx.manager.pg_inconsistent(pool, 0) - - # In the future repair might determine state of - # inconsistency itself, verify with a deep-scrub - log.info("scrubbing") - ctx.manager.do_pg_scrub(pool, 0, 'deep-scrub') - - # verify consistent - assert not ctx.manager.pg_inconsistent(pool, 0) 
- - log.info("done") - return ret - -def task(ctx, config): - """ - Test [deep] repair in several situations: - Repair [Truncate, Data EIO, MData EIO] on [Primary|Replica] - - The config should be as follows: - - Must include the log-whitelist below - Must enable filestore_debug_inject_read_err config - - example: - - tasks: - - chef: - - install: - - ceph: - log-whitelist: ['candidate had a read error', 'deep-scrub 0 missing, 1 inconsistent objects', 'deep-scrub 0 missing, 4 inconsistent objects', 'deep-scrub 1 errors', 'deep-scrub 4 errors', '!= known omap_digest', 'repair 0 missing, 1 inconsistent objects', 'repair 0 missing, 4 inconsistent objects', 'repair 1 errors, 1 fixed', 'repair 4 errors, 4 fixed', 'scrub 0 missing, 1 inconsistent', 'scrub 1 errors', 'size 1 != known size'] - conf: - osd: - filestore debug inject read err: true - - repair_test: - - """ - if config is None: - config = {} - assert isinstance(config, dict), \ - 'repair_test task only accepts a dict for config' - - if not hasattr(ctx, 'manager'): - first_mon = teuthology.get_first_mon(ctx, config) - (mon,) = ctx.cluster.only(first_mon).remotes.iterkeys() - ctx.manager = ceph_manager.CephManager( - mon, - ctx=ctx, - logger=log.getChild('ceph_manager') - ) - - tests = [ - gen_repair_test_1(mdataerr(ctx), choose_primary(ctx), "scrub"), - gen_repair_test_1(mdataerr(ctx), choose_replica(ctx), "scrub"), - gen_repair_test_1(dataerr(ctx), choose_primary(ctx), "deep-scrub"), - gen_repair_test_1(dataerr(ctx), choose_replica(ctx), "deep-scrub"), - gen_repair_test_1(trunc(ctx), choose_primary(ctx), "scrub"), - gen_repair_test_1(trunc(ctx), choose_replica(ctx), "scrub"), - gen_repair_test_2(choose_primary(ctx)), - gen_repair_test_2(choose_replica(ctx)) - ] - - for test in tests: - run_test(ctx, config, test) diff --git a/teuthology/task/rest_api.py b/teuthology/task/rest_api.py deleted file mode 100644 index de09df4646..0000000000 --- a/teuthology/task/rest_api.py +++ /dev/null @@ -1,183 +0,0 @@ -""" 
-Rest Api -""" -import logging -import contextlib -import time - -from teuthology import misc as teuthology -from teuthology import contextutil -from ..orchestra import run -from ..orchestra.daemon import DaemonGroup - -log = logging.getLogger(__name__) - - -@contextlib.contextmanager -def run_rest_api_daemon(ctx, api_clients): - """ - Wrapper starts the rest api daemons - """ - if not hasattr(ctx, 'daemons'): - ctx.daemons = DaemonGroup() - remotes = ctx.cluster.only(teuthology.is_type('client')).remotes - for rems, roles in remotes.iteritems(): - for whole_id_ in roles: - if whole_id_ in api_clients: - id_ = whole_id_[len('clients'):] - run_cmd = [ - 'sudo', - 'daemon-helper', - 'kill', - 'ceph-rest-api', - '-n', - 'client.rest{id}'.format(id=id_), ] - cl_rest_id = 'client.rest{id}'.format(id=id_) - ctx.daemons.add_daemon(rems, 'restapi', - cl_rest_id, - args=run_cmd, - logger=log.getChild(cl_rest_id), - stdin=run.PIPE, - wait=False, - ) - for i in range(1, 12): - log.info('testing for ceph-rest-api try {0}'.format(i)) - run_cmd = [ - 'wget', - '-O', - '/dev/null', - '-q', - 'http://localhost:5000/api/v0.1/status' - ] - proc = rems.run( - args=run_cmd, - check_status=False - ) - if proc.exitstatus == 0: - break - time.sleep(5) - if proc.exitstatus != 0: - raise RuntimeError('Cannot contact ceph-rest-api') - try: - yield - - finally: - """ - TO DO: destroy daemons started -- modify iter_daemons_of_role - """ - teuthology.stop_daemons_of_type(ctx, 'restapi') - -@contextlib.contextmanager -def task(ctx, config): - """ - Start up rest-api. - - To start on on all clients:: - - tasks: - - ceph: - - rest-api: - - To only run on certain clients:: - - tasks: - - ceph: - - rest-api: [client.0, client.3] - - or - - tasks: - - ceph: - - rest-api: - client.0: - client.3: - - The general flow of things here is: - 1. Find clients on which rest-api is supposed to run (api_clients) - 2. Generate keyring values - 3. Start up ceph-rest-api daemons - On cleanup: - 4. 
Stop the daemons - 5. Delete keyring value files. - """ - api_clients = [] - remotes = ctx.cluster.only(teuthology.is_type('client')).remotes - log.info(remotes) - if config == None: - api_clients = ['client.{id}'.format(id=id_) - for id_ in teuthology.all_roles_of_type(ctx.cluster, 'client')] - else: - api_clients = config - log.info(api_clients) - testdir = teuthology.get_testdir(ctx) - coverage_dir = '{tdir}/archive/coverage'.format(tdir=testdir) - for rems, roles in remotes.iteritems(): - for whole_id_ in roles: - if whole_id_ in api_clients: - id_ = whole_id_[len('client.'):] - keyring = '/etc/ceph/ceph.client.rest{id}.keyring'.format( - id=id_) - rems.run( - args=[ - 'sudo', - 'adjust-ulimits', - 'ceph-coverage', - coverage_dir, - 'ceph-authtool', - '--create-keyring', - '--gen-key', - '--name=client.rest{id}'.format(id=id_), - '--set-uid=0', - '--cap', 'mon', 'allow *', - '--cap', 'osd', 'allow *', - '--cap', 'mds', 'allow', - keyring, - run.Raw('&&'), - 'sudo', - 'chmod', - '0644', - keyring, - ], - ) - rems.run( - args=[ - 'sudo', - 'sh', - '-c', - run.Raw("'"), - "echo", - '[client.rest{id}]'.format(id=id_), - run.Raw('>>'), - "/etc/ceph/ceph.conf", - run.Raw("'") - ] - ) - rems.run( - args=[ - 'sudo', - 'sh', - '-c', - run.Raw("'"), - 'echo', - 'restapi', - 'keyring', - '=', - '/etc/ceph/ceph.client.rest{id}.keyring'.format(id=id_), - run.Raw('>>'), - '/etc/ceph/ceph.conf', - run.Raw("'"), - ] - ) - rems.run( - args=[ - 'ceph', - 'auth', - 'import', - '-i', - '/etc/ceph/ceph.client.rest{id}.keyring'.format(id=id_), - ] - ) - with contextutil.nested( - lambda: run_rest_api_daemon(ctx=ctx, api_clients=api_clients),): - yield - diff --git a/teuthology/task/restart.py b/teuthology/task/restart.py deleted file mode 100644 index 85e053a725..0000000000 --- a/teuthology/task/restart.py +++ /dev/null @@ -1,163 +0,0 @@ -""" -Daemon restart -""" -import logging -import pipes - -from teuthology import misc as teuthology -from teuthology.orchestra import run as tor - 
-from ..orchestra import run -log = logging.getLogger(__name__) - -def restart_daemon(ctx, config, role, id_, *args): - """ - Handle restart (including the execution of the command parameters passed) - """ - log.info('Restarting {r}.{i} daemon...'.format(r=role, i=id_)) - daemon = ctx.daemons.get_daemon(role, id_) - log.debug('Waiting for exit of {r}.{i} daemon...'.format(r=role, i=id_)) - try: - daemon.wait_for_exit() - except tor.CommandFailedError as e: - log.debug('Command Failed: {e}'.format(e=e)) - if len(args) > 0: - confargs = ['--{k}={v}'.format(k=k, v=v) for k,v in zip(args[0::2], args[1::2])] - log.debug('Doing restart of {r}.{i} daemon with args: {a}...'.format(r=role, i=id_, a=confargs)) - daemon.restart_with_args(confargs) - else: - log.debug('Doing restart of {r}.{i} daemon...'.format(r=role, i=id_)) - daemon.restart() - -def get_tests(ctx, config, role, remote, testdir): - """Download restart tests""" - srcdir = '{tdir}/restart.{role}'.format(tdir=testdir, role=role) - - refspec = config.get('branch') - if refspec is None: - refspec = config.get('sha1') - if refspec is None: - refspec = config.get('tag') - if refspec is None: - refspec = 'HEAD' - log.info('Pulling restart qa/workunits from ref %s', refspec) - - remote.run( - logger=log.getChild(role), - args=[ - 'mkdir', '--', srcdir, - run.Raw('&&'), - 'git', - 'archive', - '--remote=git://ceph.newdream.net/git/ceph.git', - '%s:qa/workunits' % refspec, - run.Raw('|'), - 'tar', - '-C', srcdir, - '-x', - '-f-', - run.Raw('&&'), - 'cd', '--', srcdir, - run.Raw('&&'), - 'if', 'test', '-e', 'Makefile', run.Raw(';'), 'then', 'make', run.Raw(';'), 'fi', - run.Raw('&&'), - 'find', '-executable', '-type', 'f', '-printf', r'%P\0'.format(srcdir=srcdir), - run.Raw('>{tdir}/restarts.list'.format(tdir=testdir)), - ], - ) - restarts = sorted(teuthology.get_file( - remote, - '{tdir}/restarts.list'.format(tdir=testdir)).split('\0')) - return (srcdir, restarts) - -def task(ctx, config): - """ - Execute commands and 
allow daemon restart with config options. - Each process executed can output to stdout restart commands of the form: - restart - This will restart the daemon . with the specified config values once - by modifying the conf file with those values, and then replacing the old conf file - once the daemon is restarted. - This task does not kill a running daemon, it assumes the daemon will abort on an - assert specified in the config. - - tasks: - - install: - - ceph: - - restart: - exec: - client.0: - - test_backtraces.py - - """ - assert isinstance(config, dict), "task kill got invalid config" - - testdir = teuthology.get_testdir(ctx) - - try: - assert 'exec' in config, "config requires exec key with : entries" - for role, task in config['exec'].iteritems(): - log.info('restart for role {r}'.format(r=role)) - (remote,) = ctx.cluster.only(role).remotes.iterkeys() - srcdir, restarts = get_tests(ctx, config, role, remote, testdir) - log.info('Running command on role %s host %s', role, remote.name) - spec = '{spec}'.format(spec=task[0]) - log.info('Restarts list: %s', restarts) - log.info('Spec is %s', spec) - to_run = [w for w in restarts if w == task or w.find(spec) != -1] - log.info('To run: %s', to_run) - for c in to_run: - log.info('Running restart script %s...', c) - args = [ - run.Raw('TESTDIR="{tdir}"'.format(tdir=testdir)), - ] - env = config.get('env') - if env is not None: - for var, val in env.iteritems(): - quoted_val = pipes.quote(val) - env_arg = '{var}={val}'.format(var=var, val=quoted_val) - args.append(run.Raw(env_arg)) - args.extend([ - 'adjust-ulimits', - 'ceph-coverage', - '{tdir}/archive/coverage'.format(tdir=testdir), - '{srcdir}/{c}'.format( - srcdir=srcdir, - c=c, - ), - ]) - proc = remote.run( - args=args, - stdout=tor.PIPE, - stdin=tor.PIPE, - stderr=log, - wait=False, - ) - log.info('waiting for a command from script') - while True: - l = proc.stdout.readline() - if not l or l == '': - break - log.debug('script command: {c}'.format(c=l)) - ll = 
l.strip() - cmd = ll.split(' ') - if cmd[0] == "done": - break - assert cmd[0] == 'restart', "script sent invalid command request to kill task" - # cmd should be: restart - # or to clear, just: restart - restart_daemon(ctx, config, cmd[1], cmd[2], *cmd[3:]) - proc.stdin.writelines(['restarted\n']) - proc.stdin.flush() - try: - proc.wait() - except tor.CommandFailedError: - raise Exception('restart task got non-zero exit status from script: {s}'.format(s=c)) - finally: - log.info('Finishing %s on %s...', task, role) - remote.run( - logger=log.getChild(role), - args=[ - 'rm', '-rf', '--', '{tdir}/restarts.list'.format(tdir=testdir), srcdir, - ], - ) diff --git a/teuthology/task/rgw.py b/teuthology/task/rgw.py deleted file mode 100644 index afa464d520..0000000000 --- a/teuthology/task/rgw.py +++ /dev/null @@ -1,808 +0,0 @@ -""" -rgw routines -""" -import argparse -import contextlib -import json -import logging -import os - -from cStringIO import StringIO - -from ..orchestra import run -from teuthology import misc as teuthology -from teuthology import contextutil -from teuthology.task_util.rgw import rgwadmin -from teuthology.task_util.rados import (rados, create_ec_pool, - create_replicated_pool, - create_cache_pool) - -log = logging.getLogger(__name__) - - -@contextlib.contextmanager -def create_apache_dirs(ctx, config): - """ - Remotely create apache directories. Delete when finished. 
- """ - log.info('Creating apache directories...') - testdir = teuthology.get_testdir(ctx) - for client in config.iterkeys(): - ctx.cluster.only(client).run( - args=[ - 'mkdir', - '-p', - '{tdir}/apache/htdocs.{client}'.format(tdir=testdir, - client=client), - '{tdir}/apache/tmp.{client}/fastcgi_sock'.format( - tdir=testdir, - client=client), - run.Raw('&&'), - 'mkdir', - '{tdir}/archive/apache.{client}'.format(tdir=testdir, - client=client), - ], - ) - try: - yield - finally: - log.info('Cleaning up apache directories...') - for client in config.iterkeys(): - ctx.cluster.only(client).run( - args=[ - 'rm', - '-rf', - '{tdir}/apache/tmp.{client}'.format(tdir=testdir, - client=client), - run.Raw('&&'), - 'rmdir', - '{tdir}/apache/htdocs.{client}'.format(tdir=testdir, - client=client), - ], - ) - - for client in config.iterkeys(): - ctx.cluster.only(client).run( - args=[ - 'rmdir', - '{tdir}/apache'.format(tdir=testdir), - ], - check_status=False, # only need to remove once per host - ) - - -@contextlib.contextmanager -def ship_apache_configs(ctx, config, role_endpoints): - """ - Ship apache config and rgw.fgci to all clients. 
Clean up on termination - """ - assert isinstance(config, dict) - assert isinstance(role_endpoints, dict) - testdir = teuthology.get_testdir(ctx) - log.info('Shipping apache config and rgw.fcgi...') - src = os.path.join(os.path.dirname(__file__), 'apache.conf.template') - for client, conf in config.iteritems(): - (remote,) = ctx.cluster.only(client).remotes.keys() - system_type = teuthology.get_system_type(remote) - if not conf: - conf = {} - idle_timeout = conf.get('idle_timeout', 30) - if system_type == 'deb': - mod_path = '/usr/lib/apache2/modules' - print_continue = 'on' - else: - mod_path = '/usr/lib64/httpd/modules' - print_continue = 'off' - host, port = role_endpoints[client] - with file(src, 'rb') as f: - conf = f.read().format( - testdir=testdir, - mod_path=mod_path, - print_continue=print_continue, - host=host, - port=port, - client=client, - idle_timeout=idle_timeout, - ) - teuthology.write_file( - remote=remote, - path='{tdir}/apache/apache.{client}.conf'.format( - tdir=testdir, - client=client), - data=conf, - ) - teuthology.write_file( - remote=remote, - path='{tdir}/apache/htdocs.{client}/rgw.fcgi'.format( - tdir=testdir, - client=client), - data="""#!/bin/sh -ulimit -c unlimited -exec radosgw -f -n {client} -k /etc/ceph/ceph.{client}.keyring --rgw-socket-path {tdir}/apache/tmp.{client}/fastcgi_sock/rgw_sock - -""".format(tdir=testdir, client=client) - ) - remote.run( - args=[ - 'chmod', - 'a=rx', - '{tdir}/apache/htdocs.{client}/rgw.fcgi'.format(tdir=testdir, - client=client), - ], - ) - try: - yield - finally: - log.info('Removing apache config...') - for client in config.iterkeys(): - ctx.cluster.only(client).run( - args=[ - 'rm', - '-f', - '{tdir}/apache/apache.{client}.conf'.format(tdir=testdir, - client=client), - run.Raw('&&'), - 'rm', - '-f', - '{tdir}/apache/htdocs.{client}/rgw.fcgi'.format( - tdir=testdir, - client=client), - ], - ) - - -@contextlib.contextmanager -def start_rgw(ctx, config): - """ - Start rgw on remote sites. 
- """ - log.info('Starting rgw...') - testdir = teuthology.get_testdir(ctx) - for client in config.iterkeys(): - (remote,) = ctx.cluster.only(client).remotes.iterkeys() - - client_config = config.get(client) - if client_config is None: - client_config = {} - log.info("rgw %s config is %s", client, client_config) - id_ = client.split('.', 1)[1] - log.info('client {client} is id {id}'.format(client=client, id=id_)) - cmd_prefix = [ - 'sudo', - 'adjust-ulimits', - 'ceph-coverage', - '{tdir}/archive/coverage'.format(tdir=testdir), - 'daemon-helper', - 'term', - ] - - rgw_cmd = ['radosgw'] - - if ctx.rgw.frontend == 'apache': - rgw_cmd.extend([ - '--rgw-socket-path', - '{tdir}/apache/tmp.{client}/fastcgi_sock/rgw_sock'.format( - tdir=testdir, - client=client, - ), - ]) - elif ctx.rgw.frontend == 'civetweb': - host, port = ctx.rgw.role_endpoints[client] - rgw_cmd.extend([ - '--rgw-frontends', - 'civetweb port={port}'.format(port=port), - ]) - - rgw_cmd.extend([ - '-n', client, - '-k', '/etc/ceph/ceph.{client}.keyring'.format(client=client), - '--log-file', - '/var/log/ceph/rgw.{client}.log'.format(client=client), - '--rgw_ops_log_socket_path', - '{tdir}/rgw.opslog.{client}.sock'.format(tdir=testdir, - client=client), - '--foreground', - run.Raw('|'), - 'sudo', - 'tee', - '/var/log/ceph/rgw.{client}.stdout'.format(tdir=testdir, - client=client), - run.Raw('2>&1'), - ]) - - if client_config.get('valgrind'): - cmd_prefix = teuthology.get_valgrind_args( - testdir, - client, - cmd_prefix, - client_config.get('valgrind') - ) - - run_cmd = list(cmd_prefix) - run_cmd.extend(rgw_cmd) - - ctx.daemons.add_daemon( - remote, 'rgw', client, - args=run_cmd, - logger=log.getChild(client), - stdin=run.PIPE, - wait=False, - ) - - try: - yield - finally: - teuthology.stop_daemons_of_type(ctx, 'rgw') - for client in config.iterkeys(): - ctx.cluster.only(client).run( - args=[ - 'rm', - '-f', - '{tdir}/rgw.opslog.{client}.sock'.format(tdir=testdir, - client=client), - ], - ) - - 
-@contextlib.contextmanager -def start_apache(ctx, config): - """ - Start apache on remote sites. - """ - log.info('Starting apache...') - testdir = teuthology.get_testdir(ctx) - apaches = {} - for client in config.iterkeys(): - (remote,) = ctx.cluster.only(client).remotes.keys() - system_type = teuthology.get_system_type(remote) - if system_type == 'deb': - apache_name = 'apache2' - else: - apache_name = '/usr/sbin/httpd.worker' - proc = remote.run( - args=[ - 'adjust-ulimits', - 'daemon-helper', - 'kill', - apache_name, - '-X', - '-f', - '{tdir}/apache/apache.{client}.conf'.format(tdir=testdir, - client=client), - ], - logger=log.getChild(client), - stdin=run.PIPE, - wait=False, - ) - apaches[client] = proc - - try: - yield - finally: - log.info('Stopping apache...') - for client, proc in apaches.iteritems(): - proc.stdin.close() - - run.wait(apaches.itervalues()) - - -def extract_user_info(client_config): - """ - Extract user info from the client config specified. Returns a dict - that includes system key information. - """ - # test if there isn't a system user or if there isn't a name for that - # user, return None - if ('system user' not in client_config or - 'name' not in client_config['system user']): - return None - - user_info = dict() - user_info['system_key'] = dict( - user=client_config['system user']['name'], - access_key=client_config['system user']['access key'], - secret_key=client_config['system user']['secret key'], - ) - return user_info - - -def extract_zone_info(ctx, client, client_config): - """ - Get zone information. 
- :param client: dictionary of client information - :param client_config: dictionary of client configuration information - :returns: zone extracted from client and client_config information - """ - ceph_config = ctx.ceph.conf.get('global', {}) - ceph_config.update(ctx.ceph.conf.get('client', {})) - ceph_config.update(ctx.ceph.conf.get(client, {})) - for key in ['rgw zone', 'rgw region', 'rgw zone root pool']: - assert key in ceph_config, \ - 'ceph conf must contain {key} for {client}'.format(key=key, - client=client) - region = ceph_config['rgw region'] - zone = ceph_config['rgw zone'] - zone_info = dict() - for key in ['rgw control pool', 'rgw gc pool', 'rgw log pool', - 'rgw intent log pool', 'rgw usage log pool', - 'rgw user keys pool', 'rgw user email pool', - 'rgw user swift pool', 'rgw user uid pool', - 'rgw domain root']: - new_key = key.split(' ', 1)[1] - new_key = new_key.replace(' ', '_') - - if key in ceph_config: - value = ceph_config[key] - log.debug('{key} specified in ceph_config ({val})'.format( - key=key, val=value)) - zone_info[new_key] = value - else: - zone_info[new_key] = '.' + region + '.' + zone + '.' + new_key - - index_pool = '.' + region + '.' + zone + '.' + 'index_pool' - data_pool = '.' + region + '.' + zone + '.' + 'data_pool' - data_extra_pool = '.' + region + '.' + zone + '.' + 'data_extra_pool' - - zone_info['placement_pools'] = [{'key': 'default_placement', - 'val': {'index_pool': index_pool, - 'data_pool': data_pool, - 'data_extra_pool': data_extra_pool} - }] - - # these keys are meant for the zones argument in the region info. We - # insert them into zone_info with a different format and then remove them - # in the fill_in_endpoints() method - for key in ['rgw log meta', 'rgw log data']: - if key in ceph_config: - zone_info[key] = ceph_config[key] - - # these keys are meant for the zones argument in the region info. 
We - # insert them into zone_info with a different format and then remove them - # in the fill_in_endpoints() method - for key in ['rgw log meta', 'rgw log data']: - if key in ceph_config: - zone_info[key] = ceph_config[key] - - return region, zone, zone_info - - -def extract_region_info(region, region_info): - """ - Extract region information from the region_info parameter, using get - to set default values. - - :param region: name of the region - :param region_info: region information (in dictionary form). - :returns: dictionary of region information set from region_info, using - default values for missing fields. - """ - assert isinstance(region_info['zones'], list) and region_info['zones'], \ - 'zones must be a non-empty list' - return dict( - name=region, - api_name=region_info.get('api name', region), - is_master=region_info.get('is master', False), - log_meta=region_info.get('log meta', False), - log_data=region_info.get('log data', False), - master_zone=region_info.get('master zone', region_info['zones'][0]), - placement_targets=region_info.get('placement targets', - [{'name': 'default_placement', - 'tags': []}]), - default_placement=region_info.get('default placement', - 'default_placement'), - ) - - -def assign_ports(ctx, config): - """ - Assign port numberst starting with port 7280. - """ - port = 7280 - role_endpoints = {} - for remote, roles_for_host in ctx.cluster.remotes.iteritems(): - for role in roles_for_host: - if role in config: - role_endpoints[role] = (remote.name.split('@')[1], port) - port += 1 - - return role_endpoints - - -def fill_in_endpoints(region_info, role_zones, role_endpoints): - """ - Iterate through the list of role_endpoints, filling in zone information - - :param region_info: region data - :param role_zones: region and zone information. 
- :param role_endpoints: endpoints being used - """ - for role, (host, port) in role_endpoints.iteritems(): - region, zone, zone_info, _ = role_zones[role] - host, port = role_endpoints[role] - endpoint = 'http://{host}:{port}/'.format(host=host, port=port) - # check if the region specified under client actually exists - # in region_info (it should, if properly configured). - # If not, throw a reasonable error - if region not in region_info: - raise Exception( - 'Region: {region} was specified but no corresponding' - ' entry was found under \'regions\''.format(region=region)) - - region_conf = region_info[region] - region_conf.setdefault('endpoints', []) - region_conf['endpoints'].append(endpoint) - - # this is the payload for the 'zones' field in the region field - zone_payload = dict() - zone_payload['endpoints'] = [endpoint] - zone_payload['name'] = zone - - # Pull the log meta and log data settings out of zone_info, if they - # exist, then pop them as they don't actually belong in the zone info - for key in ['rgw log meta', 'rgw log data']: - new_key = key.split(' ', 1)[1] - new_key = new_key.replace(' ', '_') - - if key in zone_info: - value = zone_info.pop(key) - else: - value = 'false' - - zone_payload[new_key] = value - - region_conf.setdefault('zones', []) - region_conf['zones'].append(zone_payload) - - -@contextlib.contextmanager -def configure_users(ctx, config, everywhere=False): - """ - Create users by remotely running rgwadmin commands using extracted - user information. - """ - log.info('Configuring users...') - - # extract the user info and append it to the payload tuple for the given - # client - for client, c_config in config.iteritems(): - if not c_config: - continue - user_info = extract_user_info(c_config) - if not user_info: - continue - - # For data sync the master zones and regions must have the - # system users of the secondary zones. To keep this simple, - # just create the system users on every client if regions are - # configured. 
- clients_to_create_as = [client] - if everywhere: - clients_to_create_as = config.keys() - for client_name in clients_to_create_as: - log.debug('Creating user {user} on {client}'.format( - user=user_info['system_key']['user'], client=client)) - rgwadmin(ctx, client_name, - cmd=[ - 'user', 'create', - '--uid', user_info['system_key']['user'], - '--access-key', user_info['system_key']['access_key'], - '--secret', user_info['system_key']['secret_key'], - '--display-name', user_info['system_key']['user'], - '--system', - ], - check_status=True, - ) - - yield - - -@contextlib.contextmanager -def create_nonregion_pools(ctx, config, regions): - """Create replicated or erasure coded data pools for rgw.""" - if regions: - yield - return - - log.info('creating data pools') - for client in config.keys(): - (remote,) = ctx.cluster.only(client).remotes.iterkeys() - data_pool = '.rgw.buckets' - if ctx.rgw.ec_data_pool: - create_ec_pool(remote, data_pool, client, 64) - else: - create_replicated_pool(remote, data_pool, 64) - if ctx.rgw.cache_pools: - create_cache_pool(remote, data_pool, data_pool + '.cache', 64, - 64*1024*1024) - yield - - -@contextlib.contextmanager -def configure_regions_and_zones(ctx, config, regions, role_endpoints): - """ - Configure regions and zones from rados and rgw. - """ - if not regions: - log.debug( - 'In rgw.configure_regions_and_zones() and regions is None. 
' - 'Bailing') - yield - return - - log.info('Configuring regions and zones...') - - log.debug('config is %r', config) - log.debug('regions are %r', regions) - log.debug('role_endpoints = %r', role_endpoints) - # extract the zone info - role_zones = dict([(client, extract_zone_info(ctx, client, c_config)) - for client, c_config in config.iteritems()]) - log.debug('roles_zones = %r', role_zones) - - # extract the user info and append it to the payload tuple for the given - # client - for client, c_config in config.iteritems(): - if not c_config: - user_info = None - else: - user_info = extract_user_info(c_config) - - (region, zone, zone_info) = role_zones[client] - role_zones[client] = (region, zone, zone_info, user_info) - - region_info = dict([ - (region_name, extract_region_info(region_name, r_config)) - for region_name, r_config in regions.iteritems()]) - - fill_in_endpoints(region_info, role_zones, role_endpoints) - - # clear out the old defaults - first_mon = teuthology.get_first_mon(ctx, config) - (mon,) = ctx.cluster.only(first_mon).remotes.iterkeys() - # removing these objects from .rgw.root and the per-zone root pools - # may or may not matter - rados(ctx, mon, - cmd=['-p', '.rgw.root', 'rm', 'region_info.default']) - rados(ctx, mon, - cmd=['-p', '.rgw.root', 'rm', 'zone_info.default']) - - for client in config.iterkeys(): - for role, (_, zone, zone_info, user_info) in role_zones.iteritems(): - rados(ctx, mon, - cmd=['-p', zone_info['domain_root'], - 'rm', 'region_info.default']) - rados(ctx, mon, - cmd=['-p', zone_info['domain_root'], - 'rm', 'zone_info.default']) - - (remote,) = ctx.cluster.only(role).remotes.keys() - for pool_info in zone_info['placement_pools']: - remote.run(args=['ceph', 'osd', 'pool', 'create', - pool_info['val']['index_pool'], '64', '64']) - if ctx.rgw.ec_data_pool: - create_ec_pool(remote, pool_info['val']['data_pool'], - zone, 64) - else: - create_replicated_pool( - remote, pool_info['val']['data_pool'], - 64) - - rgwadmin(ctx, 
client, - cmd=['-n', client, 'zone', 'set', '--rgw-zone', zone], - stdin=StringIO(json.dumps(dict( - zone_info.items() + user_info.items()))), - check_status=True) - - for region, info in region_info.iteritems(): - region_json = json.dumps(info) - log.debug('region info is: %s', region_json) - rgwadmin(ctx, client, - cmd=['-n', client, 'region', 'set'], - stdin=StringIO(region_json), - check_status=True) - if info['is_master']: - rgwadmin(ctx, client, - cmd=['-n', client, - 'region', 'default', - '--rgw-region', region], - check_status=True) - - rgwadmin(ctx, client, cmd=['-n', client, 'regionmap', 'update']) - yield - - -@contextlib.contextmanager -def task(ctx, config): - """ - Either use configure apache to run a rados gateway, or use the built-in - civetweb server. - Only one should be run per machine, since it uses a hard-coded port for - now. - - For example, to run rgw on all clients:: - - tasks: - - ceph: - - rgw: - - To only run on certain clients:: - - tasks: - - ceph: - - rgw: [client.0, client.3] - - or - - tasks: - - ceph: - - rgw: - client.0: - client.3: - - You can adjust the idle timeout for fastcgi (default is 30 seconds): - - tasks: - - ceph: - - rgw: - client.0: - idle_timeout: 90 - - To run radosgw through valgrind: - - tasks: - - ceph: - - rgw: - client.0: - valgrind: [--tool=memcheck] - client.3: - valgrind: [--tool=memcheck] - - To use civetweb instead of apache: - - tasks: - - ceph: - - rgw: - - client.0 - overrides: - rgw: - frontend: civetweb - - Note that without a modified fastcgi module e.g. with the default - one on CentOS, you must have rgw print continue = false in ceph.conf:: - - tasks: - - ceph: - conf: - global: - rgw print continue: false - - rgw: [client.0] - - To run rgws for multiple regions or zones, describe the regions - and their zones in a regions section. The endpoints will be - generated by this task. 
Each client must have a region, zone, - and pools assigned in ceph.conf:: - - tasks: - - install: - - ceph: - conf: - client.0: - rgw region: foo - rgw zone: foo-1 - rgw region root pool: .rgw.rroot.foo - rgw zone root pool: .rgw.zroot.foo - rgw log meta: true - rgw log data: true - client.1: - rgw region: bar - rgw zone: bar-master - rgw region root pool: .rgw.rroot.bar - rgw zone root pool: .rgw.zroot.bar - rgw log meta: true - rgw log data: true - client.2: - rgw region: bar - rgw zone: bar-secondary - rgw region root pool: .rgw.rroot.bar - rgw zone root pool: .rgw.zroot.bar-secondary - - rgw: - ec-data-pool: true - regions: - foo: - api name: api_name # default: region name - is master: true # default: false - master zone: foo-1 # default: first zone - zones: [foo-1] - log meta: true - log data: true - placement targets: [target1, target2] # default: [] - default placement: target2 # default: '' - bar: - api name: bar-api - zones: [bar-master, bar-secondary] - client.0: - system user: - name: foo-system - access key: X2IYPSTY1072DDY1SJMC - secret key: YIMHICpPvT+MhLTbSsiBJ1jQF15IFvJA8tgwJEcm - client.1: - system user: - name: bar1 - access key: Y2IYPSTY1072DDY1SJMC - secret key: XIMHICpPvT+MhLTbSsiBJ1jQF15IFvJA8tgwJEcm - client.2: - system user: - name: bar2 - access key: Z2IYPSTY1072DDY1SJMC - secret key: ZIMHICpPvT+MhLTbSsiBJ1jQF15IFvJA8tgwJEcm - """ - if config is None: - config = dict(('client.{id}'.format(id=id_), None) - for id_ in teuthology.all_roles_of_type( - ctx.cluster, 'client')) - elif isinstance(config, list): - config = dict((name, None) for name in config) - - overrides = ctx.config.get('overrides', {}) - teuthology.deep_merge(config, overrides.get('rgw', {})) - - regions = {} - if 'regions' in config: - # separate region info so only clients are keys in config - regions = config['regions'] - del config['regions'] - - role_endpoints = assign_ports(ctx, config) - ctx.rgw = argparse.Namespace() - ctx.rgw.role_endpoints = role_endpoints - # stash 
the region info for later, since it was deleted from the config - # structure - ctx.rgw.regions = regions - - ctx.rgw.ec_data_pool = False - if 'ec-data-pool' in config: - ctx.rgw.ec_data_pool = bool(config['ec-data-pool']) - del config['ec-data-pool'] - ctx.rgw.cache_pools = False - if 'cache-pools' in config: - ctx.rgw.cache_pools = bool(config['cache-pools']) - del config['cache-pools'] - - ctx.rgw.frontend = 'apache' - if 'frontend' in config: - ctx.rgw.frontend = config['frontend'] - del config['frontend'] - - subtasks = [ - lambda: configure_regions_and_zones( - ctx=ctx, - config=config, - regions=regions, - role_endpoints=role_endpoints, - ), - lambda: configure_users( - ctx=ctx, - config=config, - everywhere=bool(regions), - ), - lambda: create_nonregion_pools( - ctx=ctx, config=config, regions=regions), - ] - if ctx.rgw.frontend == 'apache': - subtasks.insert(0, lambda: create_apache_dirs(ctx=ctx, config=config)) - subtasks.extend([ - lambda: ship_apache_configs(ctx=ctx, config=config, - role_endpoints=role_endpoints), - lambda: start_rgw(ctx=ctx, config=config), - lambda: start_apache(ctx=ctx, config=config), - ]) - elif ctx.rgw.frontend == 'civetweb': - subtasks.extend([ - lambda: start_rgw(ctx=ctx, config=config), - ]) - else: - raise ValueError("frontend must be 'apache' or 'civetweb'") - - log.info("Using %s as radosgw frontend", ctx.rgw.frontend) - with contextutil.nested(*subtasks): - yield diff --git a/teuthology/task/rgw_logsocket.py b/teuthology/task/rgw_logsocket.py deleted file mode 100644 index 6f49b00d8a..0000000000 --- a/teuthology/task/rgw_logsocket.py +++ /dev/null @@ -1,161 +0,0 @@ -""" -rgw s3tests logging wrappers -""" -from cStringIO import StringIO -from configobj import ConfigObj -import contextlib -import logging -import s3tests - -from teuthology import misc as teuthology -from teuthology import contextutil - -log = logging.getLogger(__name__) - - -@contextlib.contextmanager -def download(ctx, config): - """ - Run s3tests download 
function - """ - return s3tests.download(ctx, config) - -def _config_user(s3tests_conf, section, user): - """ - Run s3tests user config function - """ - return s3tests._config_user(s3tests_conf, section, user) - -@contextlib.contextmanager -def create_users(ctx, config): - """ - Run s3tests user create function - """ - return s3tests.create_users(ctx, config) - -@contextlib.contextmanager -def configure(ctx, config): - """ - Run s3tests user configure function - """ - return s3tests.configure(ctx, config) - -@contextlib.contextmanager -def run_tests(ctx, config): - """ - Run remote netcat tests - """ - assert isinstance(config, dict) - testdir = teuthology.get_testdir(ctx) - for client, client_config in config.iteritems(): - client_config['extra_args'] = [ - 's3tests.functional.test_s3:test_bucket_list_return_data', - ] -# args = [ -# 'S3TEST_CONF={tdir}/archive/s3-tests.{client}.conf'.format(tdir=testdir, client=client), -# '{tdir}/s3-tests/virtualenv/bin/nosetests'.format(tdir=testdir), -# '-w', -# '{tdir}/s3-tests'.format(tdir=testdir), -# '-v', -# 's3tests.functional.test_s3:test_bucket_list_return_data', -# ] -# if client_config is not None and 'extra_args' in client_config: -# args.extend(client_config['extra_args']) -# -# ctx.cluster.only(client).run( -# args=args, -# ) - - s3tests.run_tests(ctx, config) - - netcat_out = StringIO() - - for client, client_config in config.iteritems(): - ctx.cluster.only(client).run( - args = [ - 'netcat', - '-w', '5', - '-U', '{tdir}/rgw.opslog.sock'.format(tdir=testdir), - ], - stdout = netcat_out, - ) - - out = netcat_out.getvalue() - - assert len(out) > 100 - - log.info('Received', out) - - yield - - -@contextlib.contextmanager -def task(ctx, config): - """ - Run some s3-tests suite against rgw, verify opslog socket returns data - - Must restrict testing to a particular client:: - - tasks: - - ceph: - - rgw: [client.0] - - s3tests: [client.0] - - To pass extra arguments to nose (e.g. 
to run a certain test):: - - tasks: - - ceph: - - rgw: [client.0] - - s3tests: - client.0: - extra_args: ['test_s3:test_object_acl_grand_public_read'] - client.1: - extra_args: ['--exclude', 'test_100_continue'] - """ - assert config is None or isinstance(config, list) \ - or isinstance(config, dict), \ - "task s3tests only supports a list or dictionary for configuration" - all_clients = ['client.{id}'.format(id=id_) - for id_ in teuthology.all_roles_of_type(ctx.cluster, 'client')] - if config is None: - config = all_clients - if isinstance(config, list): - config = dict.fromkeys(config) - clients = config.keys() - - overrides = ctx.config.get('overrides', {}) - # merge each client section, not the top level. - for (client, cconf) in config.iteritems(): - teuthology.deep_merge(cconf, overrides.get('rgw-logsocket', {})) - - log.debug('config is %s', config) - - s3tests_conf = {} - for client in clients: - s3tests_conf[client] = ConfigObj( - indent_type='', - infile={ - 'DEFAULT': - { - 'port' : 7280, - 'is_secure' : 'no', - }, - 'fixtures' : {}, - 's3 main' : {}, - 's3 alt' : {}, - } - ) - - with contextutil.nested( - lambda: download(ctx=ctx, config=config), - lambda: create_users(ctx=ctx, config=dict( - clients=clients, - s3tests_conf=s3tests_conf, - )), - lambda: configure(ctx=ctx, config=dict( - clients=config, - s3tests_conf=s3tests_conf, - )), - lambda: run_tests(ctx=ctx, config=config), - ): - yield diff --git a/teuthology/task/s3readwrite.py b/teuthology/task/s3readwrite.py deleted file mode 100644 index 476015d76b..0000000000 --- a/teuthology/task/s3readwrite.py +++ /dev/null @@ -1,346 +0,0 @@ -""" -Run rgw s3 readwite tests -""" -from cStringIO import StringIO -import base64 -import contextlib -import logging -import os -import random -import string -import yaml - -from teuthology import misc as teuthology -from teuthology import contextutil -from ..config import config as teuth_config -from ..orchestra import run -from ..orchestra.connection import 
split_user - -log = logging.getLogger(__name__) - - -@contextlib.contextmanager -def download(ctx, config): - """ - Download the s3 tests from the git builder. - Remove downloaded s3 file upon exit. - - The context passed in should be identical to the context - passed in to the main task. - """ - assert isinstance(config, dict) - log.info('Downloading s3-tests...') - testdir = teuthology.get_testdir(ctx) - for (client, cconf) in config.items(): - branch = cconf.get('force-branch', None) - if not branch: - branch = cconf.get('branch', 'master') - sha1 = cconf.get('sha1') - ctx.cluster.only(client).run( - args=[ - 'git', 'clone', - '-b', branch, - teuth_config.ceph_git_base_url + 's3-tests.git', - '{tdir}/s3-tests'.format(tdir=testdir), - ], - ) - if sha1 is not None: - ctx.cluster.only(client).run( - args=[ - 'cd', '{tdir}/s3-tests'.format(tdir=testdir), - run.Raw('&&'), - 'git', 'reset', '--hard', sha1, - ], - ) - try: - yield - finally: - log.info('Removing s3-tests...') - testdir = teuthology.get_testdir(ctx) - for client in config: - ctx.cluster.only(client).run( - args=[ - 'rm', - '-rf', - '{tdir}/s3-tests'.format(tdir=testdir), - ], - ) - - -def _config_user(s3tests_conf, section, user): - """ - Configure users for this section by stashing away keys, ids, and - email addresses. - """ - s3tests_conf[section].setdefault('user_id', user) - s3tests_conf[section].setdefault('email', '{user}+test@test.test'.format(user=user)) - s3tests_conf[section].setdefault('display_name', 'Mr. {user}'.format(user=user)) - s3tests_conf[section].setdefault('access_key', ''.join(random.choice(string.uppercase) for i in xrange(20))) - s3tests_conf[section].setdefault('secret_key', base64.b64encode(os.urandom(40))) - -@contextlib.contextmanager -def create_users(ctx, config): - """ - Create a default s3 user. 
- """ - assert isinstance(config, dict) - log.info('Creating rgw users...') - testdir = teuthology.get_testdir(ctx) - users = {'s3': 'foo'} - cached_client_user_names = dict() - for client in config['clients']: - cached_client_user_names[client] = dict() - s3tests_conf = config['s3tests_conf'][client] - s3tests_conf.setdefault('readwrite', {}) - s3tests_conf['readwrite'].setdefault('bucket', 'rwtest-' + client + '-{random}-') - s3tests_conf['readwrite'].setdefault('readers', 10) - s3tests_conf['readwrite'].setdefault('writers', 3) - s3tests_conf['readwrite'].setdefault('duration', 300) - s3tests_conf['readwrite'].setdefault('files', {}) - rwconf = s3tests_conf['readwrite'] - rwconf['files'].setdefault('num', 10) - rwconf['files'].setdefault('size', 2000) - rwconf['files'].setdefault('stddev', 500) - for section, user in users.iteritems(): - _config_user(s3tests_conf, section, '{user}.{client}'.format(user=user, client=client)) - log.debug('creating user {user} on {client}'.format(user=s3tests_conf[section]['user_id'], - client=client)) - - # stash the 'delete_user' flag along with user name for easier cleanup - delete_this_user = True - if 'delete_user' in s3tests_conf['s3']: - delete_this_user = s3tests_conf['s3']['delete_user'] - log.debug('delete_user set to {flag} for {client}'.format(flag=delete_this_user, client=client)) - cached_client_user_names[client][section+user] = (s3tests_conf[section]['user_id'], delete_this_user) - - # skip actual user creation if the create_user flag is set to false for this client - if 'create_user' in s3tests_conf['s3'] and s3tests_conf['s3']['create_user'] == False: - log.debug('create_user set to False, skipping user creation for {client}'.format(client=client)) - continue - else: - ctx.cluster.only(client).run( - args=[ - 'adjust-ulimits', - 'ceph-coverage', - '{tdir}/archive/coverage'.format(tdir=testdir), - 'radosgw-admin', - '-n', client, - 'user', 'create', - '--uid', s3tests_conf[section]['user_id'], - '--display-name', 
s3tests_conf[section]['display_name'], - '--access-key', s3tests_conf[section]['access_key'], - '--secret', s3tests_conf[section]['secret_key'], - '--email', s3tests_conf[section]['email'], - ], - ) - try: - yield - finally: - for client in config['clients']: - for section, user in users.iteritems(): - #uid = '{user}.{client}'.format(user=user, client=client) - real_uid, delete_this_user = cached_client_user_names[client][section+user] - if delete_this_user: - ctx.cluster.only(client).run( - args=[ - 'adjust-ulimits', - 'ceph-coverage', - '{tdir}/archive/coverage'.format(tdir=testdir), - 'radosgw-admin', - '-n', client, - 'user', 'rm', - '--uid', real_uid, - '--purge-data', - ], - ) - else: - log.debug('skipping delete for user {uid} on {client}'.format(uid=real_uid, client=client)) - -@contextlib.contextmanager -def configure(ctx, config): - """ - Configure the s3-tests. This includes the running of the - bootstrap code and the updating of local conf files. - """ - assert isinstance(config, dict) - log.info('Configuring s3-readwrite-tests...') - for client, properties in config['clients'].iteritems(): - s3tests_conf = config['s3tests_conf'][client] - if properties is not None and 'rgw_server' in properties: - host = None - for target, roles in zip(ctx.config['targets'].iterkeys(), ctx.config['roles']): - log.info('roles: ' + str(roles)) - log.info('target: ' + str(target)) - if properties['rgw_server'] in roles: - _, host = split_user(target) - assert host is not None, "Invalid client specified as the rgw_server" - s3tests_conf['s3']['host'] = host - else: - s3tests_conf['s3']['host'] = 'localhost' - - def_conf = s3tests_conf['DEFAULT'] - s3tests_conf['s3'].setdefault('port', def_conf['port']) - s3tests_conf['s3'].setdefault('is_secure', def_conf['is_secure']) - - (remote,) = ctx.cluster.only(client).remotes.keys() - remote.run( - args=[ - 'cd', - '{tdir}/s3-tests'.format(tdir=teuthology.get_testdir(ctx)), - run.Raw('&&'), - './bootstrap', - ], - ) - conf_fp = 
StringIO() - conf = dict( - s3=s3tests_conf['s3'], - readwrite=s3tests_conf['readwrite'], - ) - yaml.safe_dump(conf, conf_fp, default_flow_style=False) - teuthology.write_file( - remote=remote, - path='{tdir}/archive/s3readwrite.{client}.config.yaml'.format(tdir=teuthology.get_testdir(ctx), client=client), - data=conf_fp.getvalue(), - ) - yield - - -@contextlib.contextmanager -def run_tests(ctx, config): - """ - Run the s3readwrite tests after everything is set up. - - :param ctx: Context passed to task - :param config: specific configuration information - """ - assert isinstance(config, dict) - testdir = teuthology.get_testdir(ctx) - for client, client_config in config.iteritems(): - (remote,) = ctx.cluster.only(client).remotes.keys() - conf = teuthology.get_file(remote, '{tdir}/archive/s3readwrite.{client}.config.yaml'.format(tdir=testdir, client=client)) - args = [ - '{tdir}/s3-tests/virtualenv/bin/s3tests-test-readwrite'.format(tdir=testdir), - ] - if client_config is not None and 'extra_args' in client_config: - args.extend(client_config['extra_args']) - - ctx.cluster.only(client).run( - args=args, - stdin=conf, - ) - yield - - -@contextlib.contextmanager -def task(ctx, config): - """ - Run the s3tests-test-readwrite suite against rgw. - - To run all tests on all clients:: - - tasks: - - ceph: - - rgw: - - s3readwrite: - - To restrict testing to particular clients:: - - tasks: - - ceph: - - rgw: [client.0] - - s3readwrite: [client.0] - - To run against a server on client.1:: - - tasks: - - ceph: - - rgw: [client.1] - - s3readwrite: - client.0: - rgw_server: client.1 - - To pass extra test arguments - - tasks: - - ceph: - - rgw: [client.0] - - s3readwrite: - client.0: - readwrite: - bucket: mybucket - readers: 10 - writers: 3 - duration: 600 - files: - num: 10 - size: 2000 - stddev: 500 - client.1: - ... 
- - To override s3 configuration - - tasks: - - ceph: - - rgw: [client.0] - - s3readwrite: - client.0: - s3: - user_id: myuserid - display_name: myname - email: my@email - access_key: myaccesskey - secret_key: mysecretkey - - """ - assert config is None or isinstance(config, list) \ - or isinstance(config, dict), \ - "task s3tests only supports a list or dictionary for configuration" - all_clients = ['client.{id}'.format(id=id_) - for id_ in teuthology.all_roles_of_type(ctx.cluster, 'client')] - if config is None: - config = all_clients - if isinstance(config, list): - config = dict.fromkeys(config) - clients = config.keys() - - overrides = ctx.config.get('overrides', {}) - # merge each client section, not the top level. - for client in config.iterkeys(): - if not config[client]: - config[client] = {} - teuthology.deep_merge(config[client], overrides.get('s3readwrite', {})) - - log.debug('in s3readwrite, config is %s', config) - - s3tests_conf = {} - for client in clients: - if config[client] is None: - config[client] = {} - config[client].setdefault('s3', {}) - config[client].setdefault('readwrite', {}) - - s3tests_conf[client] = ({ - 'DEFAULT': - { - 'port' : 7280, - 'is_secure' : False, - }, - 'readwrite' : config[client]['readwrite'], - 's3' : config[client]['s3'], - }) - - with contextutil.nested( - lambda: download(ctx=ctx, config=config), - lambda: create_users(ctx=ctx, config=dict( - clients=clients, - s3tests_conf=s3tests_conf, - )), - lambda: configure(ctx=ctx, config=dict( - clients=config, - s3tests_conf=s3tests_conf, - )), - lambda: run_tests(ctx=ctx, config=config), - ): - pass - yield diff --git a/teuthology/task/s3roundtrip.py b/teuthology/task/s3roundtrip.py deleted file mode 100644 index 5a7093d6f4..0000000000 --- a/teuthology/task/s3roundtrip.py +++ /dev/null @@ -1,302 +0,0 @@ -""" -Run rgw roundtrip message tests -""" -from cStringIO import StringIO -import base64 -import contextlib -import logging -import os -import random -import string 
-import yaml - -from teuthology import misc as teuthology -from teuthology import contextutil -from ..config import config as teuth_config -from ..orchestra import run -from ..orchestra.connection import split_user - -log = logging.getLogger(__name__) - - -@contextlib.contextmanager -def download(ctx, config): - """ - Download the s3 tests from the git builder. - Remove downloaded s3 file upon exit. - - The context passed in should be identical to the context - passed in to the main task. - """ - assert isinstance(config, list) - log.info('Downloading s3-tests...') - testdir = teuthology.get_testdir(ctx) - for client in config: - ctx.cluster.only(client).run( - args=[ - 'git', 'clone', - teuth_config.ceph_git_base_url + 's3-tests.git', - '{tdir}/s3-tests'.format(tdir=testdir), - ], - ) - try: - yield - finally: - log.info('Removing s3-tests...') - for client in config: - ctx.cluster.only(client).run( - args=[ - 'rm', - '-rf', - '{tdir}/s3-tests'.format(tdir=testdir), - ], - ) - -def _config_user(s3tests_conf, section, user): - """ - Configure users for this section by stashing away keys, ids, and - email addresses. - """ - s3tests_conf[section].setdefault('user_id', user) - s3tests_conf[section].setdefault('email', '{user}+test@test.test'.format(user=user)) - s3tests_conf[section].setdefault('display_name', 'Mr. {user}'.format(user=user)) - s3tests_conf[section].setdefault('access_key', ''.join(random.choice(string.uppercase) for i in xrange(20))) - s3tests_conf[section].setdefault('secret_key', base64.b64encode(os.urandom(40))) - -@contextlib.contextmanager -def create_users(ctx, config): - """ - Create a default s3 user. 
- """ - assert isinstance(config, dict) - log.info('Creating rgw users...') - testdir = teuthology.get_testdir(ctx) - users = {'s3': 'foo'} - for client in config['clients']: - s3tests_conf = config['s3tests_conf'][client] - s3tests_conf.setdefault('roundtrip', {}) - s3tests_conf['roundtrip'].setdefault('bucket', 'rttest-' + client + '-{random}-') - s3tests_conf['roundtrip'].setdefault('readers', 10) - s3tests_conf['roundtrip'].setdefault('writers', 3) - s3tests_conf['roundtrip'].setdefault('duration', 300) - s3tests_conf['roundtrip'].setdefault('files', {}) - rtconf = s3tests_conf['roundtrip'] - rtconf['files'].setdefault('num', 10) - rtconf['files'].setdefault('size', 2000) - rtconf['files'].setdefault('stddev', 500) - for section, user in [('s3', 'foo')]: - _config_user(s3tests_conf, section, '{user}.{client}'.format(user=user, client=client)) - ctx.cluster.only(client).run( - args=[ - 'adjust-ulimits', - 'ceph-coverage', - '{tdir}/archive/coverage'.format(tdir=testdir), - 'radosgw-admin', - '-n', client, - 'user', 'create', - '--uid', s3tests_conf[section]['user_id'], - '--display-name', s3tests_conf[section]['display_name'], - '--access-key', s3tests_conf[section]['access_key'], - '--secret', s3tests_conf[section]['secret_key'], - '--email', s3tests_conf[section]['email'], - ], - ) - try: - yield - finally: - for client in config['clients']: - for user in users.itervalues(): - uid = '{user}.{client}'.format(user=user, client=client) - ctx.cluster.only(client).run( - args=[ - 'adjust-ulimits', - 'ceph-coverage', - '{tdir}/archive/coverage'.format(tdir=testdir), - 'radosgw-admin', - '-n', client, - 'user', 'rm', - '--uid', uid, - '--purge-data', - ], - ) - -@contextlib.contextmanager -def configure(ctx, config): - """ - Configure the s3-tests. This includes the running of the - bootstrap code and the updating of local conf files. 
- """ - assert isinstance(config, dict) - log.info('Configuring s3-roundtrip-tests...') - testdir = teuthology.get_testdir(ctx) - for client, properties in config['clients'].iteritems(): - s3tests_conf = config['s3tests_conf'][client] - if properties is not None and 'rgw_server' in properties: - host = None - for target, roles in zip(ctx.config['targets'].iterkeys(), ctx.config['roles']): - log.info('roles: ' + str(roles)) - log.info('target: ' + str(target)) - if properties['rgw_server'] in roles: - _, host = split_user(target) - assert host is not None, "Invalid client specified as the rgw_server" - s3tests_conf['s3']['host'] = host - else: - s3tests_conf['s3']['host'] = 'localhost' - - def_conf = s3tests_conf['DEFAULT'] - s3tests_conf['s3'].setdefault('port', def_conf['port']) - s3tests_conf['s3'].setdefault('is_secure', def_conf['is_secure']) - - (remote,) = ctx.cluster.only(client).remotes.keys() - remote.run( - args=[ - 'cd', - '{tdir}/s3-tests'.format(tdir=testdir), - run.Raw('&&'), - './bootstrap', - ], - ) - conf_fp = StringIO() - conf = dict( - s3=s3tests_conf['s3'], - roundtrip=s3tests_conf['roundtrip'], - ) - yaml.safe_dump(conf, conf_fp, default_flow_style=False) - teuthology.write_file( - remote=remote, - path='{tdir}/archive/s3roundtrip.{client}.config.yaml'.format(tdir=testdir, client=client), - data=conf_fp.getvalue(), - ) - yield - - -@contextlib.contextmanager -def run_tests(ctx, config): - """ - Run the s3 roundtrip after everything is set up. 
- - :param ctx: Context passed to task - :param config: specific configuration information - """ - assert isinstance(config, dict) - testdir = teuthology.get_testdir(ctx) - for client, client_config in config.iteritems(): - (remote,) = ctx.cluster.only(client).remotes.keys() - conf = teuthology.get_file(remote, '{tdir}/archive/s3roundtrip.{client}.config.yaml'.format(tdir=testdir, client=client)) - args = [ - '{tdir}/s3-tests/virtualenv/bin/s3tests-test-roundtrip'.format(tdir=testdir), - ] - if client_config is not None and 'extra_args' in client_config: - args.extend(client_config['extra_args']) - - ctx.cluster.only(client).run( - args=args, - stdin=conf, - ) - yield - - -@contextlib.contextmanager -def task(ctx, config): - """ - Run the s3tests-test-roundtrip suite against rgw. - - To run all tests on all clients:: - - tasks: - - ceph: - - rgw: - - s3roundtrip: - - To restrict testing to particular clients:: - - tasks: - - ceph: - - rgw: [client.0] - - s3roundtrip: [client.0] - - To run against a server on client.1:: - - tasks: - - ceph: - - rgw: [client.1] - - s3roundtrip: - client.0: - rgw_server: client.1 - - To pass extra test arguments - - tasks: - - ceph: - - rgw: [client.0] - - s3roundtrip: - client.0: - roundtrip: - bucket: mybucket - readers: 10 - writers: 3 - duration: 600 - files: - num: 10 - size: 2000 - stddev: 500 - client.1: - ... 
- - To override s3 configuration - - tasks: - - ceph: - - rgw: [client.0] - - s3roundtrip: - client.0: - s3: - user_id: myuserid - display_name: myname - email: my@email - access_key: myaccesskey - secret_key: mysecretkey - - """ - assert config is None or isinstance(config, list) \ - or isinstance(config, dict), \ - "task s3tests only supports a list or dictionary for configuration" - all_clients = ['client.{id}'.format(id=id_) - for id_ in teuthology.all_roles_of_type(ctx.cluster, 'client')] - if config is None: - config = all_clients - if isinstance(config, list): - config = dict.fromkeys(config) - clients = config.keys() - - s3tests_conf = {} - for client in clients: - if config[client] is None: - config[client] = {} - config[client].setdefault('s3', {}) - config[client].setdefault('roundtrip', {}) - - s3tests_conf[client] = ({ - 'DEFAULT': - { - 'port' : 7280, - 'is_secure' : False, - }, - 'roundtrip' : config[client]['roundtrip'], - 's3' : config[client]['s3'], - }) - - with contextutil.nested( - lambda: download(ctx=ctx, config=clients), - lambda: create_users(ctx=ctx, config=dict( - clients=clients, - s3tests_conf=s3tests_conf, - )), - lambda: configure(ctx=ctx, config=dict( - clients=config, - s3tests_conf=s3tests_conf, - )), - lambda: run_tests(ctx=ctx, config=config), - ): - pass - yield diff --git a/teuthology/task/s3tests.py b/teuthology/task/s3tests.py deleted file mode 100644 index abbacb9bfa..0000000000 --- a/teuthology/task/s3tests.py +++ /dev/null @@ -1,402 +0,0 @@ -""" -Run a set of s3 tests on rgw. 
-""" -from cStringIO import StringIO -from configobj import ConfigObj -import base64 -import contextlib -import logging -import os -import random -import string - -import teuthology.task_util.rgw as rgw_utils - -from teuthology import misc as teuthology -from teuthology import contextutil -from ..config import config as teuth_config -from ..orchestra import run -from ..orchestra.connection import split_user - -log = logging.getLogger(__name__) - -def extract_sync_client_data(ctx, client_name): - """ - Extract synchronized client rgw zone and rgw region information. - - :param ctx: Context passed to the s3tests task - :param name: Name of client that we are synching with - """ - return_region_name = None - return_dict = None - client = ctx.ceph.conf.get(client_name, None) - if client: - current_client_zone = client.get('rgw zone', None) - if current_client_zone: - (endpoint_host, endpoint_port) = ctx.rgw.role_endpoints.get(client_name, (None, None)) - # pull out the radosgw_agent stuff - regions = ctx.rgw.regions - for region in regions: - log.debug('jbuck, region is {region}'.format(region=region)) - region_data = ctx.rgw.regions[region] - log.debug('region data is {region}'.format(region=region_data)) - zones = region_data['zones'] - for zone in zones: - if current_client_zone in zone: - return_region_name = region - return_dict = dict() - return_dict['api_name'] = region_data['api name'] - return_dict['is_master'] = region_data['is master'] - return_dict['port'] = endpoint_port - return_dict['host'] = endpoint_host - - # The s3tests expect the sync_agent_[addr|port} to be - # set on the non-master node for some reason - if not region_data['is master']: - (rgwagent_host, rgwagent_port) = ctx.radosgw_agent.endpoint - (return_dict['sync_agent_addr'], _) = ctx.rgw.role_endpoints[rgwagent_host] - return_dict['sync_agent_port'] = rgwagent_port - - else: #if client_zone: - log.debug('No zone info for {host}'.format(host=client_name)) - else: # if client - log.debug('No 
ceph conf for {host}'.format(host=client_name)) - - return return_region_name, return_dict - -def update_conf_with_region_info(ctx, config, s3tests_conf): - """ - Scan for a client (passed in s3tests_conf) that is an s3agent - with which we can sync. Update information in local conf file - if such a client is found. - """ - for key in s3tests_conf.keys(): - # we'll assume that there's only one sync relationship (source / destination) with client.X - # as the key for now - - # Iterate through all of the radosgw_agent (rgwa) configs and see if a - # given client is involved in a relationship. - # If a given client isn't, skip it - this_client_in_rgwa_config = False - for rgwa in ctx.radosgw_agent.config.keys(): - rgwa_data = ctx.radosgw_agent.config[rgwa] - - if key in rgwa_data['src'] or key in rgwa_data['dest']: - this_client_in_rgwa_config = True - log.debug('{client} is in an radosgw-agent sync relationship'.format(client=key)) - radosgw_sync_data = ctx.radosgw_agent.config[key] - break - if not this_client_in_rgwa_config: - log.debug('{client} is NOT in an radosgw-agent sync relationship'.format(client=key)) - continue - - source_client = radosgw_sync_data['src'] - dest_client = radosgw_sync_data['dest'] - - # #xtract the pertinent info for the source side - source_region_name, source_region_dict = extract_sync_client_data(ctx, source_client) - log.debug('\t{key} source_region {source_region} source_dict {source_dict}'.format - (key=key,source_region=source_region_name,source_dict=source_region_dict)) - - # The source *should* be the master region, but test anyway and then set it as the default region - if source_region_dict['is_master']: - log.debug('Setting {region} as default_region'.format(region=source_region_name)) - s3tests_conf[key]['fixtures'].setdefault('default_region', source_region_name) - - # Extract the pertinent info for the destination side - dest_region_name, dest_region_dict = extract_sync_client_data(ctx, dest_client) - log.debug('\t{key} 
dest_region {dest_region} dest_dict {dest_dict}'.format - (key=key,dest_region=dest_region_name,dest_dict=dest_region_dict)) - - # now add these regions to the s3tests_conf object - s3tests_conf[key]['region {region_name}'.format(region_name=source_region_name)] = source_region_dict - s3tests_conf[key]['region {region_name}'.format(region_name=dest_region_name)] = dest_region_dict - -@contextlib.contextmanager -def download(ctx, config): - """ - Download the s3 tests from the git builder. - Remove downloaded s3 file upon exit. - - The context passed in should be identical to the context - passed in to the main task. - """ - assert isinstance(config, dict) - log.info('Downloading s3-tests...') - testdir = teuthology.get_testdir(ctx) - for (client, cconf) in config.items(): - branch = cconf.get('force-branch', None) - if not branch: - branch = cconf.get('branch', 'master') - sha1 = cconf.get('sha1') - ctx.cluster.only(client).run( - args=[ - 'git', 'clone', - '-b', branch, - teuth_config.ceph_git_base_url + 's3-tests.git', - '{tdir}/s3-tests'.format(tdir=testdir), - ], - ) - if sha1 is not None: - ctx.cluster.only(client).run( - args=[ - 'cd', '{tdir}/s3-tests'.format(tdir=testdir), - run.Raw('&&'), - 'git', 'reset', '--hard', sha1, - ], - ) - try: - yield - finally: - log.info('Removing s3-tests...') - testdir = teuthology.get_testdir(ctx) - for client in config: - ctx.cluster.only(client).run( - args=[ - 'rm', - '-rf', - '{tdir}/s3-tests'.format(tdir=testdir), - ], - ) - - -def _config_user(s3tests_conf, section, user): - """ - Configure users for this section by stashing away keys, ids, and - email addresses. - """ - s3tests_conf[section].setdefault('user_id', user) - s3tests_conf[section].setdefault('email', '{user}+test@test.test'.format(user=user)) - s3tests_conf[section].setdefault('display_name', 'Mr. 
{user}'.format(user=user)) - s3tests_conf[section].setdefault('access_key', ''.join(random.choice(string.uppercase) for i in xrange(20))) - s3tests_conf[section].setdefault('secret_key', base64.b64encode(os.urandom(40))) - - -@contextlib.contextmanager -def create_users(ctx, config): - """ - Create a main and an alternate s3 user. - """ - assert isinstance(config, dict) - log.info('Creating rgw users...') - testdir = teuthology.get_testdir(ctx) - users = {'s3 main': 'foo', 's3 alt': 'bar'} - for client in config['clients']: - s3tests_conf = config['s3tests_conf'][client] - s3tests_conf.setdefault('fixtures', {}) - s3tests_conf['fixtures'].setdefault('bucket prefix', 'test-' + client + '-{random}-') - for section, user in users.iteritems(): - _config_user(s3tests_conf, section, '{user}.{client}'.format(user=user, client=client)) - log.debug('Creating user {user} on {host}'.format(user=s3tests_conf[section]['user_id'], host=client)) - ctx.cluster.only(client).run( - args=[ - 'adjust-ulimits', - 'ceph-coverage', - '{tdir}/archive/coverage'.format(tdir=testdir), - 'radosgw-admin', - '-n', client, - 'user', 'create', - '--uid', s3tests_conf[section]['user_id'], - '--display-name', s3tests_conf[section]['display_name'], - '--access-key', s3tests_conf[section]['access_key'], - '--secret', s3tests_conf[section]['secret_key'], - '--email', s3tests_conf[section]['email'], - ], - ) - try: - yield - finally: - for client in config['clients']: - for user in users.itervalues(): - uid = '{user}.{client}'.format(user=user, client=client) - ctx.cluster.only(client).run( - args=[ - 'adjust-ulimits', - 'ceph-coverage', - '{tdir}/archive/coverage'.format(tdir=testdir), - 'radosgw-admin', - '-n', client, - 'user', 'rm', - '--uid', uid, - '--purge-data', - ], - ) - - -@contextlib.contextmanager -def configure(ctx, config): - """ - Configure the s3-tests. This includes the running of the - bootstrap code and the updating of local conf files. 
- """ - assert isinstance(config, dict) - log.info('Configuring s3-tests...') - testdir = teuthology.get_testdir(ctx) - for client, properties in config['clients'].iteritems(): - s3tests_conf = config['s3tests_conf'][client] - if properties is not None and 'rgw_server' in properties: - host = None - for target, roles in zip(ctx.config['targets'].iterkeys(), ctx.config['roles']): - log.info('roles: ' + str(roles)) - log.info('target: ' + str(target)) - if properties['rgw_server'] in roles: - _, host = split_user(target) - assert host is not None, "Invalid client specified as the rgw_server" - s3tests_conf['DEFAULT']['host'] = host - else: - s3tests_conf['DEFAULT']['host'] = 'localhost' - - (remote,) = ctx.cluster.only(client).remotes.keys() - remote.run( - args=[ - 'cd', - '{tdir}/s3-tests'.format(tdir=testdir), - run.Raw('&&'), - './bootstrap', - ], - ) - conf_fp = StringIO() - s3tests_conf.write(conf_fp) - teuthology.write_file( - remote=remote, - path='{tdir}/archive/s3-tests.{client}.conf'.format(tdir=testdir, client=client), - data=conf_fp.getvalue(), - ) - yield - -@contextlib.contextmanager -def sync_users(ctx, config): - """ - Sync this user. - """ - assert isinstance(config, dict) - # do a full sync if this is a multi-region test - if rgw_utils.multi_region_enabled(ctx): - log.debug('Doing a full sync') - rgw_utils.radosgw_agent_sync_all(ctx) - else: - log.debug('Not a multi-region config; skipping the metadata sync') - - yield - -@contextlib.contextmanager -def run_tests(ctx, config): - """ - Run the s3tests after everything is set up. 
- - :param ctx: Context passed to task - :param config: specific configuration information - """ - assert isinstance(config, dict) - testdir = teuthology.get_testdir(ctx) - for client, client_config in config.iteritems(): - args = [ - 'S3TEST_CONF={tdir}/archive/s3-tests.{client}.conf'.format(tdir=testdir, client=client), - '{tdir}/s3-tests/virtualenv/bin/nosetests'.format(tdir=testdir), - '-w', - '{tdir}/s3-tests'.format(tdir=testdir), - '-v', - '-a', '!fails_on_rgw', - ] - if client_config is not None and 'extra_args' in client_config: - args.extend(client_config['extra_args']) - - ctx.cluster.only(client).run( - args=args, - ) - yield - -@contextlib.contextmanager -def task(ctx, config): - """ - Run the s3-tests suite against rgw. - - To run all tests on all clients:: - - tasks: - - ceph: - - rgw: - - s3tests: - - To restrict testing to particular clients:: - - tasks: - - ceph: - - rgw: [client.0] - - s3tests: [client.0] - - To run against a server on client.1:: - - tasks: - - ceph: - - rgw: [client.1] - - s3tests: - client.0: - rgw_server: client.1 - - To pass extra arguments to nose (e.g. to run a certain test):: - - tasks: - - ceph: - - rgw: [client.0] - - s3tests: - client.0: - extra_args: ['test_s3:test_object_acl_grand_public_read'] - client.1: - extra_args: ['--exclude', 'test_100_continue'] - """ - assert config is None or isinstance(config, list) \ - or isinstance(config, dict), \ - "task s3tests only supports a list or dictionary for configuration" - all_clients = ['client.{id}'.format(id=id_) - for id_ in teuthology.all_roles_of_type(ctx.cluster, 'client')] - if config is None: - config = all_clients - if isinstance(config, list): - config = dict.fromkeys(config) - clients = config.keys() - - overrides = ctx.config.get('overrides', {}) - # merge each client section, not the top level. 
- for client in config.iterkeys(): - if not config[client]: - config[client] = {} - teuthology.deep_merge(config[client], overrides.get('s3tests', {})) - - log.debug('s3tests config is %s', config) - - s3tests_conf = {} - for client in clients: - s3tests_conf[client] = ConfigObj( - indent_type='', - infile={ - 'DEFAULT': - { - 'port' : 7280, - 'is_secure' : 'no', - }, - 'fixtures' : {}, - 's3 main' : {}, - 's3 alt' : {}, - } - ) - - # Only attempt to add in the region info if there's a radosgw_agent configured - if hasattr(ctx, 'radosgw_agent'): - update_conf_with_region_info(ctx, config, s3tests_conf) - - with contextutil.nested( - lambda: download(ctx=ctx, config=config), - lambda: create_users(ctx=ctx, config=dict( - clients=clients, - s3tests_conf=s3tests_conf, - )), - lambda: sync_users(ctx=ctx, config=config), - lambda: configure(ctx=ctx, config=dict( - clients=config, - s3tests_conf=s3tests_conf, - )), - lambda: run_tests(ctx=ctx, config=config), - ): - pass - yield diff --git a/teuthology/task/samba.py b/teuthology/task/samba.py deleted file mode 100644 index 6e0932f283..0000000000 --- a/teuthology/task/samba.py +++ /dev/null @@ -1,243 +0,0 @@ -""" -Samba -""" -import contextlib -import logging -import sys -import time - -from teuthology import misc as teuthology -from ..orchestra import run -from ..orchestra.daemon import DaemonGroup - -log = logging.getLogger(__name__) - - -def get_sambas(ctx, roles): - """ - Scan for roles that are samba. Yield the id of the the samba role - (samba.0, samba.1...) and the associated remote site - - :param ctx: Context - :param roles: roles for this test (extracted from yaml files) - """ - for role in roles: - assert isinstance(role, basestring) - PREFIX = 'samba.' - assert role.startswith(PREFIX) - id_ = role[len(PREFIX):] - (remote,) = ctx.cluster.only(role).remotes.iterkeys() - yield (id_, remote) - - -@contextlib.contextmanager -def task(ctx, config): - """ - Setup samba smbd with ceph vfs module. 
This task assumes the samba - package has already been installed via the install task. - - The config is optional and defaults to starting samba on all nodes. - If a config is given, it is expected to be a list of - samba nodes to start smbd servers on. - - Example that starts smbd on all samba nodes:: - - tasks: - - install: - - install: - project: samba - extra_packages: ['samba'] - - ceph: - - samba: - - interactive: - - Example that starts smbd on just one of the samba nodes and cifs on the other:: - - tasks: - - samba: [samba.0] - - cifs: [samba.1] - - An optional backend can be specified, and requires a path which smbd will - use as the backend storage location: - - roles: - - [osd.0, osd.1, osd.2, mon.0, mon.1, mon.2, mds.a] - - [client.0, samba.0] - - tasks: - - ceph: - - ceph-fuse: [client.0] - - samba: - samba.0: - cephfuse: "{testdir}/mnt.0" - - This mounts ceph to {testdir}/mnt.0 using fuse, and starts smbd with - a UNC of //localhost/cephfuse. Access through that UNC will be on - the ceph fuse mount point. - - If no arguments are specified in the samba - role, the default behavior is to enable the ceph UNC //localhost/ceph - and use the ceph vfs module as the smbd backend. 
- - :param ctx: Context - :param config: Configuration - """ - log.info("Setting up smbd with ceph vfs...") - assert config is None or isinstance(config, list) or isinstance(config, dict), \ - "task samba got invalid config" - - if config is None: - config = dict(('samba.{id}'.format(id=id_), None) - for id_ in teuthology.all_roles_of_type(ctx.cluster, 'samba')) - elif isinstance(config, list): - config = dict((name, None) for name in config) - - samba_servers = list(get_sambas(ctx=ctx, roles=config.keys())) - - testdir = teuthology.get_testdir(ctx) - - if not hasattr(ctx, 'daemons'): - ctx.daemons = DaemonGroup() - - for id_, remote in samba_servers: - - rolestr = "samba.{id_}".format(id_=id_) - - confextras = """vfs objects = ceph - ceph:config_file = /etc/ceph/ceph.conf""" - - unc = "ceph" - backend = "/" - - if config[rolestr] is not None: - # verify that there's just one parameter in role - if len(config[rolestr]) != 1: - log.error("samba config for role samba.{id_} must have only one parameter".format(id_=id_)) - raise Exception('invalid config') - confextras = "" - (unc, backendstr) = config[rolestr].items()[0] - backend = backendstr.format(testdir=testdir) - - # on first samba role, set ownership and permissions of ceph root - # so that samba tests succeed - if config[rolestr] is None and id_ == samba_servers[0][0]: - remote.run( - args=[ - 'mkdir', '-p', '/tmp/cmnt', run.Raw('&&'), - 'sudo', 'ceph-fuse', '/tmp/cmnt', run.Raw('&&'), - 'sudo', 'chown', 'ubuntu:ubuntu', '/tmp/cmnt/', run.Raw('&&'), - 'sudo', 'chmod', '1777', '/tmp/cmnt/', run.Raw('&&'), - 'sudo', 'umount', '/tmp/cmnt/', run.Raw('&&'), - 'rm', '-rf', '/tmp/cmnt', - ], - ) - else: - remote.run( - args=[ - 'sudo', 'chown', 'ubuntu:ubuntu', backend, run.Raw('&&'), - 'sudo', 'chmod', '1777', backend, - ], - ) - - teuthology.sudo_write_file(remote, "/usr/local/samba/etc/smb.conf", """ -[global] - workgroup = WORKGROUP - netbios name = DOMAIN - -[{unc}] - path = {backend} - {extras} - writeable = 
yes - valid users = ubuntu -""".format(extras=confextras, unc=unc, backend=backend)) - - # create ubuntu user - remote.run( - args=[ - 'sudo', '/usr/local/samba/bin/smbpasswd', '-e', 'ubuntu', - run.Raw('||'), - 'printf', run.Raw('"ubuntu\nubuntu\n"'), - run.Raw('|'), - 'sudo', '/usr/local/samba/bin/smbpasswd', '-s', '-a', 'ubuntu' - ]) - - smbd_cmd = [ - 'sudo', - 'daemon-helper', - 'kill', - 'nostdin', - '/usr/local/samba/sbin/smbd', - '-F', - ] - ctx.daemons.add_daemon(remote, 'smbd', id_, - args=smbd_cmd, - logger=log.getChild("smbd.{id_}".format(id_=id_)), - stdin=run.PIPE, - wait=False, - ) - - # let smbd initialize, probably a better way... - seconds_to_sleep = 100 - log.info('Sleeping for %s seconds...' % seconds_to_sleep) - time.sleep(seconds_to_sleep) - log.info('Sleeping stopped...') - - try: - yield - finally: - log.info('Stopping smbd processes...') - exc_info = (None, None, None) - for d in ctx.daemons.iter_daemons_of_role('smbd'): - try: - d.stop() - except (run.CommandFailedError, - run.CommandCrashedError, - run.ConnectionLostError): - exc_info = sys.exc_info() - log.exception('Saw exception from %s.%s', d.role, d.id_) - if exc_info != (None, None, None): - raise exc_info[0], exc_info[1], exc_info[2] - - for id_, remote in samba_servers: - remote.run( - args=[ - 'sudo', - 'rm', '-rf', - '/usr/local/samba/etc/smb.conf', - '/usr/local/samba/private/*', - '/usr/local/samba/var/run/', - '/usr/local/samba/var/locks', - '/usr/local/samba/var/lock', - ], - ) - # make sure daemons are gone - try: - remote.run( - args=[ - 'while', - 'sudo', 'killall', '-9', 'smbd', - run.Raw(';'), - 'do', 'sleep', '1', - run.Raw(';'), - 'done', - ], - ) - - remote.run( - args=[ - 'sudo', - 'lsof', - backend, - ], - ) - remote.run( - args=[ - 'sudo', - 'fuser', - '-M', - backend, - ], - ) - except Exception: - log.exception("Saw exception") - pass diff --git a/teuthology/task/scrub.py b/teuthology/task/scrub.py deleted file mode 100644 index 7a25300a67..0000000000 --- 
a/teuthology/task/scrub.py +++ /dev/null @@ -1,117 +0,0 @@ -""" -Scrub osds -""" -import contextlib -import gevent -import logging -import random -import time - -import ceph_manager -from teuthology import misc as teuthology - -log = logging.getLogger(__name__) - -@contextlib.contextmanager -def task(ctx, config): - """ - Run scrub periodically. Randomly chooses an OSD to scrub. - - The config should be as follows: - - scrub: - frequency: - deep: - - example: - - tasks: - - ceph: - - scrub: - frequency: 30 - deep: 0 - """ - if config is None: - config = {} - assert isinstance(config, dict), \ - 'scrub task only accepts a dict for configuration' - - log.info('Beginning scrub...') - - first_mon = teuthology.get_first_mon(ctx, config) - (mon,) = ctx.cluster.only(first_mon).remotes.iterkeys() - - manager = ceph_manager.CephManager( - mon, - ctx=ctx, - logger=log.getChild('ceph_manager'), - ) - - num_osds = teuthology.num_instances_of_type(ctx.cluster, 'osd') - while len(manager.get_osd_status()['up']) < num_osds: - manager.sleep(10) - - scrub_proc = Scrubber( - manager, - config, - ) - try: - yield - finally: - log.info('joining scrub') - scrub_proc.do_join() - -class Scrubber: - """ - Scrubbing is actually performed during initialzation - """ - def __init__(self, manager, config): - """ - Spawn scrubbing thread upon completion. 
- """ - self.ceph_manager = manager - self.ceph_manager.wait_for_clean() - - osd_status = self.ceph_manager.get_osd_status() - self.osds = osd_status['up'] - - self.config = config - if self.config is None: - self.config = dict() - - else: - def tmp(x): - """Local display""" - print x - self.log = tmp - - self.stopping = False - - log.info("spawning thread") - - self.thread = gevent.spawn(self.do_scrub) - - def do_join(self): - """Scrubbing thread finished""" - self.stopping = True - self.thread.get() - - def do_scrub(self): - """Perform the scrub operation""" - frequency = self.config.get("frequency", 30) - deep = self.config.get("deep", 0) - - log.info("stopping %s" % self.stopping) - - while not self.stopping: - osd = str(random.choice(self.osds)) - - if deep: - cmd = 'deep-scrub' - else: - cmd = 'scrub' - - log.info('%sbing %s' % (cmd, osd)) - self.ceph_manager.raw_cluster_cmd('osd', cmd, osd) - - time.sleep(frequency) diff --git a/teuthology/task/scrub_test.py b/teuthology/task/scrub_test.py deleted file mode 100644 index 3443ae9f45..0000000000 --- a/teuthology/task/scrub_test.py +++ /dev/null @@ -1,199 +0,0 @@ -"""Scrub testing""" -from cStringIO import StringIO - -import logging -import os -import time - -import ceph_manager -from teuthology import misc as teuthology - -log = logging.getLogger(__name__) - -def task(ctx, config): - """ - Test [deep] scrub - - tasks: - - chef: - - install: - - ceph: - log-whitelist: - - '!= known digest' - - '!= known omap_digest' - - deep-scrub 0 missing, 1 inconsistent objects - - deep-scrub 1 errors - - repair 0 missing, 1 inconsistent objects - - repair 1 errors, 1 fixed - - scrub_test: - - """ - if config is None: - config = {} - assert isinstance(config, dict), \ - 'scrub_test task only accepts a dict for configuration' - first_mon = teuthology.get_first_mon(ctx, config) - (mon,) = ctx.cluster.only(first_mon).remotes.iterkeys() - - num_osds = teuthology.num_instances_of_type(ctx.cluster, 'osd') - log.info('num_osds is 
%s' % num_osds) - - manager = ceph_manager.CephManager( - mon, - ctx=ctx, - logger=log.getChild('ceph_manager'), - ) - - while len(manager.get_osd_status()['up']) < num_osds: - time.sleep(10) - - for i in range(num_osds): - manager.raw_cluster_cmd('tell', 'osd.%d' % i, 'flush_pg_stats') - manager.wait_for_clean() - - # write some data - p = manager.do_rados(mon, ['-p', 'rbd', 'bench', '--no-cleanup', '1', 'write', '-b', '4096']) - err = p.exitstatus - log.info('err is %d' % err) - - # wait for some PG to have data that we can mess with - victim = None - osd = None - while victim is None: - stats = manager.get_pg_stats() - for pg in stats: - size = pg['stat_sum']['num_bytes'] - if size > 0: - victim = pg['pgid'] - osd = pg['acting'][0] - break - - if victim is None: - time.sleep(3) - - log.info('messing with PG %s on osd %d' % (victim, osd)) - - (osd_remote,) = ctx.cluster.only('osd.%d' % osd).remotes.iterkeys() - data_path = os.path.join( - '/var/lib/ceph/osd', - 'ceph-{id}'.format(id=osd), - 'current', - '{pg}_head'.format(pg=victim) - ) - - # fuzz time - ls_fp = StringIO() - osd_remote.run( - args=[ 'ls', data_path ], - stdout=ls_fp, - ) - ls_out = ls_fp.getvalue() - ls_fp.close() - - # find an object file we can mess with - osdfilename = None - for line in ls_out.split('\n'): - if 'object' in line: - osdfilename = line - break - assert osdfilename is not None - - # Get actual object name from osd stored filename - tmp=osdfilename.split('__') - objname=tmp[0] - objname=objname.replace('\u', '_') - log.info('fuzzing %s' % objname) - - # put a single \0 at the beginning of the file - osd_remote.run( - args=[ 'sudo', 'dd', - 'if=/dev/zero', - 'of=%s' % os.path.join(data_path, osdfilename), - 'bs=1', 'count=1', 'conv=notrunc' - ] - ) - - # scrub, verify inconsistent - manager.raw_cluster_cmd('pg', 'deep-scrub', victim) - # Give deep-scrub a chance to start - time.sleep(60) - - while True: - stats = manager.get_single_pg_stats(victim) - state = stats['state'] - - # 
wait for the scrub to finish - if 'scrubbing' in state: - time.sleep(3) - continue - - inconsistent = stats['state'].find('+inconsistent') != -1 - assert inconsistent - break - - - # repair, verify no longer inconsistent - manager.raw_cluster_cmd('pg', 'repair', victim) - # Give repair a chance to start - time.sleep(60) - - while True: - stats = manager.get_single_pg_stats(victim) - state = stats['state'] - - # wait for the scrub to finish - if 'scrubbing' in state: - time.sleep(3) - continue - - inconsistent = stats['state'].find('+inconsistent') != -1 - assert not inconsistent - break - - # Test deep-scrub with various omap modifications - manager.do_rados(mon, ['-p', 'rbd', 'setomapval', objname, 'key', 'val']) - manager.do_rados(mon, ['-p', 'rbd', 'setomapheader', objname, 'hdr']) - - # Modify omap on specific osd - log.info('fuzzing omap of %s' % objname) - manager.osd_admin_socket(osd, ['rmomapkey', 'rbd', objname, 'key']); - manager.osd_admin_socket(osd, ['setomapval', 'rbd', objname, 'badkey', 'badval']); - manager.osd_admin_socket(osd, ['setomapheader', 'rbd', objname, 'badhdr']); - - # scrub, verify inconsistent - manager.raw_cluster_cmd('pg', 'deep-scrub', victim) - # Give deep-scrub a chance to start - time.sleep(60) - - while True: - stats = manager.get_single_pg_stats(victim) - state = stats['state'] - - # wait for the scrub to finish - if 'scrubbing' in state: - time.sleep(3) - continue - - inconsistent = stats['state'].find('+inconsistent') != -1 - assert inconsistent - break - - # repair, verify no longer inconsistent - manager.raw_cluster_cmd('pg', 'repair', victim) - # Give repair a chance to start - time.sleep(60) - - while True: - stats = manager.get_single_pg_stats(victim) - state = stats['state'] - - # wait for the scrub to finish - if 'scrubbing' in state: - time.sleep(3) - continue - - inconsistent = stats['state'].find('+inconsistent') != -1 - assert not inconsistent - break - - log.info('test successful!') diff --git 
a/teuthology/task/test/__init__.py b/teuthology/task/test/__init__.py deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/teuthology/task/test/test_devstack.py b/teuthology/task/test/test_devstack.py deleted file mode 100644 index 117b307681..0000000000 --- a/teuthology/task/test/test_devstack.py +++ /dev/null @@ -1,48 +0,0 @@ -from textwrap import dedent - -from .. import devstack - - -class TestDevstack(object): - def test_parse_os_table(self): - table_str = dedent(""" - +---------------------+--------------------------------------+ - | Property | Value | - +---------------------+--------------------------------------+ - | attachments | [] | - | availability_zone | nova | - | bootable | false | - | created_at | 2014-02-21T17:14:47.548361 | - | display_description | None | - | display_name | NAME | - | id | ffdbd1bb-60dc-4d95-acfe-88774c09ad3e | - | metadata | {} | - | size | 1 | - | snapshot_id | None | - | source_volid | None | - | status | creating | - | volume_type | None | - +---------------------+--------------------------------------+ - """).strip() - expected = { - 'Property': 'Value', - 'attachments': '[]', - 'availability_zone': 'nova', - 'bootable': 'false', - 'created_at': '2014-02-21T17:14:47.548361', - 'display_description': 'None', - 'display_name': 'NAME', - 'id': 'ffdbd1bb-60dc-4d95-acfe-88774c09ad3e', - 'metadata': '{}', - 'size': '1', - 'snapshot_id': 'None', - 'source_volid': 'None', - 'status': 'creating', - 'volume_type': 'None'} - - vol_info = devstack.parse_os_table(table_str) - assert vol_info == expected - - - - diff --git a/teuthology/task/tgt.py b/teuthology/task/tgt.py deleted file mode 100644 index c2b322e082..0000000000 --- a/teuthology/task/tgt.py +++ /dev/null @@ -1,177 +0,0 @@ -""" -Task to handle tgt - -Assumptions made: - The ceph-extras tgt package may need to get installed. - The open-iscsi package needs to get installed. 
-""" -import logging -import contextlib - -from teuthology import misc as teuthology -from teuthology import contextutil - -log = logging.getLogger(__name__) - - -@contextlib.contextmanager -def start_tgt_remotes(ctx, start_tgtd): - """ - This subtask starts up a tgtd on the clients specified - """ - remotes = ctx.cluster.only(teuthology.is_type('client')).remotes - tgtd_list = [] - for rem, roles in remotes.iteritems(): - for _id in roles: - if _id in start_tgtd: - if not rem in tgtd_list: - tgtd_list.append(rem) - size = ctx.config.get('image_size', 10240) - rem.run( - args=[ - 'rbd', - 'create', - 'iscsi-image', - '--size', - str(size), - ]) - rem.run( - args=[ - 'sudo', - 'tgtadm', - '--lld', - 'iscsi', - '--mode', - 'target', - '--op', - 'new', - '--tid', - '1', - '--targetname', - 'rbd', - ]) - rem.run( - args=[ - 'sudo', - 'tgtadm', - '--lld', - 'iscsi', - '--mode', - 'logicalunit', - '--op', - 'new', - '--tid', - '1', - '--lun', - '1', - '--backing-store', - 'iscsi-image', - '--bstype', - 'rbd', - ]) - rem.run( - args=[ - 'sudo', - 'tgtadm', - '--lld', - 'iscsi', - '--op', - 'bind', - '--mode', - 'target', - '--tid', - '1', - '-I', - 'ALL', - ]) - try: - yield - - finally: - for rem in tgtd_list: - rem.run( - args=[ - 'sudo', - 'tgtadm', - '--lld', - 'iscsi', - '--mode', - 'target', - '--op', - 'delete', - '--force', - '--tid', - '1', - ]) - rem.run( - args=[ - 'rbd', - 'snap', - 'purge', - 'iscsi-image', - ]) - rem.run( - args=[ - 'sudo', - 'rbd', - 'rm', - 'iscsi-image', - ]) - - -@contextlib.contextmanager -def task(ctx, config): - """ - Start up tgt. - - To start on on all clients:: - - tasks: - - ceph: - - tgt: - - To start on certain clients:: - - tasks: - - ceph: - - tgt: [client.0, client.3] - - or - - tasks: - - ceph: - - tgt: - client.0: - client.3: - - An image blocksize size can also be specified:: - - tasks: - - ceph: - - tgt: - image_size = 20480 - - The general flow of things here is: - 1. 
Find clients on which tgt is supposed to run (start_tgtd) - 2. Remotely start up tgt daemon - On cleanup: - 3. Stop tgt daemon - - The iscsi administration is handled by the iscsi task. - """ - if config: - config = {key : val for key, val in config.items() - if key.startswith('client')} - # config at this point should only contain keys starting with 'client' - start_tgtd = [] - remotes = ctx.cluster.only(teuthology.is_type('client')).remotes - log.info(remotes) - if not config: - start_tgtd = ['client.{id}'.format(id=id_) - for id_ in teuthology.all_roles_of_type(ctx.cluster, 'client')] - else: - start_tgtd = config - log.info(start_tgtd) - with contextutil.nested( - lambda: start_tgt_remotes(ctx=ctx, start_tgtd=start_tgtd),): - yield diff --git a/teuthology/task/thrashosds.py b/teuthology/task/thrashosds.py deleted file mode 100644 index ba166ed8e3..0000000000 --- a/teuthology/task/thrashosds.py +++ /dev/null @@ -1,179 +0,0 @@ -""" -Thrash -- Simulate random osd failures. -""" -import contextlib -import logging -import ceph_manager -from teuthology import misc as teuthology - - -log = logging.getLogger(__name__) - -@contextlib.contextmanager -def task(ctx, config): - """ - "Thrash" the OSDs by randomly marking them out/down (and then back - in) until the task is ended. This loops, and every op_delay - seconds it randomly chooses to add or remove an OSD (even odds) - unless there are fewer than min_out OSDs out of the cluster, or - more than min_in OSDs in the cluster. - - All commands are run on mon0 and it stops when __exit__ is called. - - The config is optional, and is a dict containing some or all of: - - min_in: (default 3) the minimum number of OSDs to keep in the - cluster - - min_out: (default 0) the minimum number of OSDs to keep out of the - cluster - - op_delay: (5) the length of time to sleep between changing an - OSD's status - - min_dead: (0) minimum number of osds to leave down/dead. 
- - max_dead: (0) maximum number of osds to leave down/dead before waiting - for clean. This should probably be num_replicas - 1. - - clean_interval: (60) the approximate length of time to loop before - waiting until the cluster goes clean. (In reality this is used - to probabilistically choose when to wait, and the method used - makes it closer to -- but not identical to -- the half-life.) - - scrub_interval: (-1) the approximate length of time to loop before - waiting until a scrub is performed while cleaning. (In reality - this is used to probabilistically choose when to wait, and it - only applies to the cases where cleaning is being performed). - -1 is used to indicate that no scrubbing will be done. - - chance_down: (0.4) the probability that the thrasher will mark an - OSD down rather than marking it out. (The thrasher will not - consider that OSD out of the cluster, since presently an OSD - wrongly marked down will mark itself back up again.) This value - can be either an integer (eg, 75) or a float probability (eg - 0.75). - - chance_test_min_size: (0) chance to run test_pool_min_size, - which: - - kills all but one osd - - waits - - kills that osd - - revives all other osds - - verifies that the osds fully recover - - timeout: (360) the number of seconds to wait for the cluster - to become clean after each cluster change. If this doesn't - happen within the timeout, an exception will be raised. 
- - revive_timeout: (75) number of seconds to wait for an osd asok to - appear after attempting to revive the osd - - thrash_primary_affinity: (true) randomly adjust primary-affinity - - chance_pgnum_grow: (0) chance to increase a pool's size - chance_pgpnum_fix: (0) chance to adjust pgpnum to pg for a pool - pool_grow_by: (10) amount to increase pgnum by - max_pgs_per_pool_osd: (1200) don't expand pools past this size per osd - - pause_short: (3) duration of short pause - pause_long: (80) duration of long pause - pause_check_after: (50) assert osd down after this long - chance_inject_pause_short: (1) chance of injecting short stall - chance_inject_pause_long: (0) chance of injecting long stall - - clean_wait: (0) duration to wait before resuming thrashing once clean - - powercycle: (false) whether to power cycle the node instead - of just the osd process. Note that this assumes that a single - osd is the only important process on the node. - - chance_test_backfill_full: (0) chance to simulate full disks stopping - backfill - - chance_test_map_discontinuity: (0) chance to test map discontinuity - map_discontinuity_sleep_time: (40) time to wait for map trims - - example: - - tasks: - - ceph: - - thrashosds: - chance_down: 10 - op_delay: 3 - min_in: 1 - timeout: 600 - - interactive: - """ - if config is None: - config = {} - assert isinstance(config, dict), \ - 'thrashosds task only accepts a dict for configuration' - - if 'powercycle' in config: - - # sync everyone first to avoid collateral damage to / etc. 
- log.info('Doing preliminary sync to avoid collateral damage...') - ctx.cluster.run(args=['sync']) - - if 'ipmi_user' in ctx.teuthology_config: - for t, key in ctx.config['targets'].iteritems(): - host = t.split('@')[-1] - shortname = host.split('.')[0] - from ..orchestra import remote as oremote - console = oremote.getRemoteConsole( - name=host, - ipmiuser=ctx.teuthology_config['ipmi_user'], - ipmipass=ctx.teuthology_config['ipmi_password'], - ipmidomain=ctx.teuthology_config['ipmi_domain']) - cname = '{host}.{domain}'.format( - host=shortname, - domain=ctx.teuthology_config['ipmi_domain']) - log.debug('checking console status of %s' % cname) - if not console.check_status(): - log.info( - 'Failed to get console status for ' - '%s, disabling console...' - % cname) - console=None - else: - # find the remote for this console and add it - remotes = [ - r for r in ctx.cluster.remotes.keys() if r.name == t] - if len(remotes) != 1: - raise Exception( - 'Too many (or too few) remotes ' - 'found for target {t}'.format(t=t)) - remotes[0].console = console - log.debug('console ready on %s' % cname) - - # check that all osd remotes have a valid console - osds = ctx.cluster.only(teuthology.is_type('osd')) - for remote, _ in osds.remotes.iteritems(): - if not remote.console: - raise Exception( - 'IPMI console required for powercycling, ' - 'but not available on osd role: {r}'.format( - r=remote.name)) - - log.info('Beginning thrashosds...') - first_mon = teuthology.get_first_mon(ctx, config) - (mon,) = ctx.cluster.only(first_mon).remotes.iterkeys() - manager = ceph_manager.CephManager( - mon, - ctx=ctx, - config=config, - logger=log.getChild('ceph_manager'), - ) - ctx.manager = manager - thrash_proc = ceph_manager.Thrasher( - manager, - config, - logger=log.getChild('thrasher') - ) - try: - yield - finally: - log.info('joining thrashosds') - thrash_proc.do_join() - manager.wait_for_recovery(config.get('timeout', 360)) diff --git a/teuthology/task/userdata_setup.yaml 
b/teuthology/task/userdata_setup.yaml deleted file mode 100644 index eaa5f7353e..0000000000 --- a/teuthology/task/userdata_setup.yaml +++ /dev/null @@ -1,22 +0,0 @@ -#cloud-config-archive - -- type: text/cloud-config - content: | - output: - all: '| tee -a /var/log/cloud-init-output.log' - -# allow passwordless access for debugging -- | - #!/bin/bash - exec passwd -d ubuntu - -- | - #!/bin/bash - - # mount a 9p fs for storing logs - mkdir /mnt/log - mount -t 9p -o trans=virtio test_log /mnt/log - - # mount the iso image that has the test script - mkdir /mnt/cdrom - mount -t auto /dev/cdrom /mnt/cdrom diff --git a/teuthology/task/userdata_teardown.yaml b/teuthology/task/userdata_teardown.yaml deleted file mode 100644 index 7f3d64ff74..0000000000 --- a/teuthology/task/userdata_teardown.yaml +++ /dev/null @@ -1,11 +0,0 @@ -- | - #!/bin/bash - cp /var/log/cloud-init-output.log /mnt/log - -- | - #!/bin/bash - umount /mnt/log - -- | - #!/bin/bash - shutdown -h -P now diff --git a/teuthology/task/watch_notify_stress.py b/teuthology/task/watch_notify_stress.py deleted file mode 100644 index ab611c3dd4..0000000000 --- a/teuthology/task/watch_notify_stress.py +++ /dev/null @@ -1,69 +0,0 @@ -""" -test_stress_watch task -""" -import contextlib -import logging -import proc_thrasher - -from ..orchestra import run - -log = logging.getLogger(__name__) - - -@contextlib.contextmanager -def task(ctx, config): - """ - Run test_stress_watch - - The config should be as follows: - - test_stress_watch: - clients: [client list] - - example: - - tasks: - - ceph: - - test_stress_watch: - clients: [client.0] - - interactive: - """ - log.info('Beginning test_stress_watch...') - assert isinstance(config, dict), \ - "please list clients to run on" - testwatch = {} - - remotes = [] - - for role in config.get('clients', ['client.0']): - assert isinstance(role, basestring) - PREFIX = 'client.' 
- assert role.startswith(PREFIX) - id_ = role[len(PREFIX):] - (remote,) = ctx.cluster.only(role).remotes.iterkeys() - remotes.append(remote) - - args =['CEPH_CLIENT_ID={id_}'.format(id_=id_), - 'CEPH_ARGS="{flags}"'.format(flags=config.get('flags', '')), - 'daemon-helper', - 'kill', - 'multi_stress_watch foo foo' - ] - - log.info("args are %s" % (args,)) - - proc = proc_thrasher.ProcThrasher({}, remote, - args=[run.Raw(i) for i in args], - logger=log.getChild('testwatch.{id}'.format(id=id_)), - stdin=run.PIPE, - wait=False - ) - proc.start() - testwatch[id_] = proc - - try: - yield - finally: - log.info('joining watch_notify_stress') - for i in testwatch.itervalues(): - i.join() diff --git a/teuthology/task/workunit.py b/teuthology/task/workunit.py deleted file mode 100644 index e79b551ea3..0000000000 --- a/teuthology/task/workunit.py +++ /dev/null @@ -1,372 +0,0 @@ -""" -Workunit task -- Run ceph on sets of specific clients -""" -import logging -import pipes -import os - -from teuthology import misc -from teuthology.orchestra.run import CommandFailedError -from teuthology.parallel import parallel -from ..orchestra import run - -log = logging.getLogger(__name__) - -CLIENT_PREFIX = 'client.' - - -def task(ctx, config): - """ - Run ceph on all workunits found under the specified path. - - For example:: - - tasks: - - ceph: - - ceph-fuse: [client.0] - - workunit: - clients: - client.0: [direct_io, xattrs.sh] - client.1: [snaps] - branch: foo - - You can also run a list of workunits on all clients: - tasks: - - ceph: - - ceph-fuse: - - workunit: - tag: v0.47 - clients: - all: [direct_io, xattrs.sh, snaps] - - If you have an "all" section it will run all the workunits - on each client simultaneously, AFTER running any workunits specified - for individual clients. (This prevents unintended simultaneous runs.) - - To customize tests, you can specify environment variables as a dict. 
You - can also specify a time limit for each work unit (defaults to 3h): - - tasks: - - ceph: - - ceph-fuse: - - workunit: - sha1: 9b28948635b17165d17c1cf83d4a870bd138ddf6 - clients: - all: [snaps] - env: - FOO: bar - BAZ: quux - timeout: 3h - - :param ctx: Context - :param config: Configuration - """ - assert isinstance(config, dict) - assert isinstance(config.get('clients'), dict), \ - 'configuration must contain a dictionary of clients' - - overrides = ctx.config.get('overrides', {}) - misc.deep_merge(config, overrides.get('workunit', {})) - - refspec = config.get('branch') - if refspec is None: - refspec = config.get('sha1') - if refspec is None: - refspec = config.get('tag') - if refspec is None: - refspec = 'HEAD' - - timeout = config.get('timeout', '3h') - - log.info('Pulling workunits from ref %s', refspec) - - created_mountpoint = {} - - if config.get('env') is not None: - assert isinstance(config['env'], dict), 'env must be a dictionary' - clients = config['clients'] - - # Create scratch dirs for any non-all workunits - log.info('Making a separate scratch dir for every client...') - for role in clients.iterkeys(): - assert isinstance(role, basestring) - if role == "all": - continue - - assert role.startswith(CLIENT_PREFIX) - created_mnt_dir = _make_scratch_dir(ctx, role, config.get('subdir')) - created_mountpoint[role] = created_mnt_dir - - # Execute any non-all workunits - with parallel() as p: - for role, tests in clients.iteritems(): - if role != "all": - p.spawn(_run_tests, ctx, refspec, role, tests, - config.get('env'), timeout=timeout) - - # Clean up dirs from any non-all workunits - for role, created in created_mountpoint.items(): - _delete_dir(ctx, role, created) - - # Execute any 'all' workunits - if 'all' in clients: - all_tasks = clients["all"] - _spawn_on_all_clients(ctx, refspec, all_tasks, config.get('env'), - config.get('subdir'), timeout=timeout) - - -def _delete_dir(ctx, role, created_mountpoint): - """ - Delete file used by this role, 
and delete the directory that this - role appeared in. - - :param ctx: Context - :param role: "role.#" where # is used for the role id. - """ - testdir = misc.get_testdir(ctx) - id_ = role[len(CLIENT_PREFIX):] - (remote,) = ctx.cluster.only(role).remotes.iterkeys() - mnt = os.path.join(testdir, 'mnt.{id}'.format(id=id_)) - # Is there any reason why this is not: join(mnt, role) ? - client = os.path.join(mnt, 'client.{id}'.format(id=id_)) - - # Remove the directory inside the mount where the workunit ran - remote.run( - args=[ - 'sudo', - 'rm', - '-rf', - '--', - client, - ], - ) - log.info("Deleted dir {dir}".format(dir=mnt)) - - # If the mount was an artificially created dir, delete that too - if created_mountpoint: - remote.run( - args=[ - 'rmdir', - '--', - mnt, - ], - ) - log.info("Deleted artificial mount point {dir}".format(dir=client)) - - -def _make_scratch_dir(ctx, role, subdir): - """ - Make scratch directories for this role. This also makes the mount - point if that directory does not exist. - - :param ctx: Context - :param role: "role.#" where # is used for the role id. - :param subdir: use this subdir (False if not used) - """ - created_mountpoint = False - id_ = role[len(CLIENT_PREFIX):] - log.debug("getting remote for {id} role {role_}".format(id=id_, role_=role)) - (remote,) = ctx.cluster.only(role).remotes.iterkeys() - dir_owner = remote.user - mnt = os.path.join(misc.get_testdir(ctx), 'mnt.{id}'.format(id=id_)) - # if neither kclient nor ceph-fuse are required for a workunit, - # mnt may not exist. Stat and create the directory if it doesn't. 
- try: - remote.run( - args=[ - 'stat', - '--', - mnt, - ], - ) - log.info('Did not need to create dir {dir}'.format(dir=mnt)) - except CommandFailedError: - remote.run( - args=[ - 'mkdir', - '--', - mnt, - ], - ) - log.info('Created dir {dir}'.format(dir=mnt)) - created_mountpoint = True - - if not subdir: - subdir = 'client.{id}'.format(id=id_) - - if created_mountpoint: - remote.run( - args=[ - 'cd', - '--', - mnt, - run.Raw('&&'), - 'mkdir', - '--', - subdir, - ], - ) - else: - remote.run( - args=[ - # cd first so this will fail if the mount point does - # not exist; pure install -d will silently do the - # wrong thing - 'cd', - '--', - mnt, - run.Raw('&&'), - 'sudo', - 'install', - '-d', - '-m', '0755', - '--owner={user}'.format(user=dir_owner), - '--', - subdir, - ], - ) - - return created_mountpoint - - -def _spawn_on_all_clients(ctx, refspec, tests, env, subdir, timeout=None): - """ - Make a scratch directory for each client in the cluster, and then for each - test spawn _run_tests() for each role. - - See run_tests() for parameter documentation. - """ - client_generator = misc.all_roles_of_type(ctx.cluster, 'client') - client_remotes = list() - - created_mountpoint = {} - for client in client_generator: - (client_remote,) = ctx.cluster.only('client.{id}'.format(id=client)).remotes.iterkeys() - client_remotes.append((client_remote, 'client.{id}'.format(id=client))) - created_mountpoint[client] = _make_scratch_dir(ctx, "client.{id}".format(id=client), subdir) - - for unit in tests: - with parallel() as p: - for remote, role in client_remotes: - p.spawn(_run_tests, ctx, refspec, role, [unit], env, subdir, - timeout=timeout) - - # cleanup the generated client directories - client_generator = misc.all_roles_of_type(ctx.cluster, 'client') - for client in client_generator: - _delete_dir(ctx, 'client.{id}'.format(id=client), created_mountpoint[client]) - - -def _run_tests(ctx, refspec, role, tests, env, subdir=None, timeout=None): - """ - Run the individual test. 
Create a scratch directory and then extract the - workunits from git. Make the executables, and then run the tests. - Clean up (remove files created) after the tests are finished. - - :param ctx: Context - :param refspec: branch, sha1, or version tag used to identify this - build - :param tests: specific tests specified. - :param env: environment set in yaml file. Could be None. - :param subdir: subdirectory set in yaml file. Could be None - :param timeout: If present, use the 'timeout' command on the remote host - to limit execution time. Must be specified by a number - followed by 's' for seconds, 'm' for minutes, 'h' for - hours, or 'd' for days. If '0' or anything that evaluates - to False is passed, the 'timeout' command is not used. - """ - testdir = misc.get_testdir(ctx) - assert isinstance(role, basestring) - assert role.startswith(CLIENT_PREFIX) - id_ = role[len(CLIENT_PREFIX):] - (remote,) = ctx.cluster.only(role).remotes.iterkeys() - mnt = os.path.join(testdir, 'mnt.{id}'.format(id=id_)) - # subdir so we can remove and recreate this a lot without sudo - if subdir is None: - scratch_tmp = os.path.join(mnt, 'client.{id}'.format(id=id_), 'tmp') - else: - scratch_tmp = os.path.join(mnt, subdir) - srcdir = '{tdir}/workunit.{role}'.format(tdir=testdir, role=role) - - remote.run( - logger=log.getChild(role), - args=[ - 'mkdir', '--', srcdir, - run.Raw('&&'), - 'git', - 'archive', - '--remote=git://ceph.newdream.net/git/ceph.git', - '%s:qa/workunits' % refspec, - run.Raw('|'), - 'tar', - '-C', srcdir, - '-x', - '-f-', - run.Raw('&&'), - 'cd', '--', srcdir, - run.Raw('&&'), - 'if', 'test', '-e', 'Makefile', run.Raw(';'), 'then', 'make', run.Raw(';'), 'fi', - run.Raw('&&'), - 'find', '-executable', '-type', 'f', '-printf', r'%P\0'.format(srcdir=srcdir), - run.Raw('>{tdir}/workunits.list'.format(tdir=testdir)), - ], - ) - - workunits = sorted(misc.get_file( - remote, - '{tdir}/workunits.list'.format(tdir=testdir)).split('\0')) - assert workunits - - try: - assert 
isinstance(tests, list) - for spec in tests: - log.info('Running workunits matching %s on %s...', spec, role) - prefix = '{spec}/'.format(spec=spec) - to_run = [w for w in workunits if w == spec or w.startswith(prefix)] - if not to_run: - raise RuntimeError('Spec did not match any workunits: {spec!r}'.format(spec=spec)) - for workunit in to_run: - log.info('Running workunit %s...', workunit) - args = [ - 'mkdir', '-p', '--', scratch_tmp, - run.Raw('&&'), - 'cd', '--', scratch_tmp, - run.Raw('&&'), - run.Raw('CEPH_CLI_TEST_DUP_COMMAND=1'), - run.Raw('CEPH_REF={ref}'.format(ref=refspec)), - run.Raw('TESTDIR="{tdir}"'.format(tdir=testdir)), - run.Raw('CEPH_ID="{id}"'.format(id=id_)), - ] - if env is not None: - for var, val in env.iteritems(): - quoted_val = pipes.quote(val) - env_arg = '{var}={val}'.format(var=var, val=quoted_val) - args.append(run.Raw(env_arg)) - args.extend([ - 'adjust-ulimits', - 'ceph-coverage', - '{tdir}/archive/coverage'.format(tdir=testdir)]) - if timeout and timeout != '0': - args.extend(['timeout', timeout]) - args.extend([ - '{srcdir}/{workunit}'.format( - srcdir=srcdir, - workunit=workunit, - ), - ]) - remote.run( - logger=log.getChild(role), - args=args, - ) - remote.run( - logger=log.getChild(role), - args=['sudo', 'rm', '-rf', '--', scratch_tmp], - ) - finally: - log.info('Stopping %s on %s...', tests, role) - remote.run( - logger=log.getChild(role), - args=[ - 'rm', '-rf', '--', '{tdir}/workunits.list'.format(tdir=testdir), srcdir, - ], - ) diff --git a/teuthology/task_util/__init__.py b/teuthology/task_util/__init__.py deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/teuthology/task_util/rados.py b/teuthology/task_util/rados.py deleted file mode 100644 index 63a0848281..0000000000 --- a/teuthology/task_util/rados.py +++ /dev/null @@ -1,78 +0,0 @@ -import logging - -from teuthology import misc as teuthology - -log = logging.getLogger(__name__) - -def rados(ctx, remote, cmd, wait=True, check_status=False): - testdir = 
teuthology.get_testdir(ctx) - log.info("rados %s" % ' '.join(cmd)) - pre = [ - 'adjust-ulimits', - 'ceph-coverage', - '{tdir}/archive/coverage'.format(tdir=testdir), - 'rados', - ]; - pre.extend(cmd) - proc = remote.run( - args=pre, - check_status=check_status, - wait=wait, - ) - if wait: - return proc.exitstatus - else: - return proc - -def create_ec_pool(remote, name, profile_name, pgnum, profile={}): - remote.run(args=['ceph'] + cmd_erasure_code_profile(profile_name, profile)) - remote.run(args=[ - 'ceph', 'osd', 'pool', 'create', name, - str(pgnum), str(pgnum), 'erasure', profile_name, - ]) - -def create_replicated_pool(remote, name, pgnum): - remote.run(args=[ - 'ceph', 'osd', 'pool', 'create', name, str(pgnum), str(pgnum), - ]) - -def create_cache_pool(remote, base_name, cache_name, pgnum, size): - remote.run(args=[ - 'ceph', 'osd', 'pool', 'create', cache_name, str(pgnum) - ]) - remote.run(args=[ - 'ceph', 'osd', 'tier', 'add-cache', base_name, cache_name, - str(size), - ]) - -def cmd_erasure_code_profile(profile_name, profile): - """ - Return the shell command to run to create the erasure code profile - described by the profile parameter. - - :param profile_name: a string matching [A-Za-z0-9-_.]+ - :param profile: a map whose semantic depends on the erasure code plugin - :returns: a shell command as an array suitable for Remote.run - - If profile is {}, it is replaced with - - { 'k': '2', 'm': '1', 'ruleset-failure-domain': 'osd'} - - for backward compatibility. In previous versions of teuthology, - these values were hardcoded as function arguments and some yaml - files were designed with these implicit values. The teuthology - code should not know anything about the erasure code profile - content or semantic. The valid values and parameters are outside - its scope. 
- """ - - if profile == {}: - profile = { - 'k': '2', - 'm': '1', - 'ruleset-failure-domain': 'osd' - } - return [ - 'osd', 'erasure-code-profile', 'set', - profile_name - ] + [ str(key) + '=' + str(value) for key, value in profile.iteritems() ] diff --git a/teuthology/task_util/rgw.py b/teuthology/task_util/rgw.py deleted file mode 100644 index cbe3071fbe..0000000000 --- a/teuthology/task_util/rgw.py +++ /dev/null @@ -1,153 +0,0 @@ -from cStringIO import StringIO -import logging -import json -import requests -from urlparse import urlparse - -from ..orchestra.connection import split_user -from teuthology import misc as teuthology - -log = logging.getLogger(__name__) - -# simple test to indicate if multi-region testing should occur -def multi_region_enabled(ctx): - # this is populated by the radosgw-agent task, seems reasonable to - # use that as an indicator that we're testing multi-region sync - return 'radosgw_agent' in ctx - -def rgwadmin(ctx, client, cmd, stdin=StringIO(), check_status=False): - log.info('rgwadmin: {client} : {cmd}'.format(client=client,cmd=cmd)) - testdir = teuthology.get_testdir(ctx) - pre = [ - 'adjust-ulimits', - 'ceph-coverage'.format(tdir=testdir), - '{tdir}/archive/coverage'.format(tdir=testdir), - 'radosgw-admin'.format(tdir=testdir), - '--log-to-stderr', - '--format', 'json', - '-n', client, - ] - pre.extend(cmd) - log.info('rgwadmin: cmd=%s' % pre) - (remote,) = ctx.cluster.only(client).remotes.iterkeys() - proc = remote.run( - args=pre, - check_status=check_status, - stdout=StringIO(), - stderr=StringIO(), - stdin=stdin, - ) - r = proc.exitstatus - out = proc.stdout.getvalue() - j = None - if not r and out != '': - try: - j = json.loads(out) - log.info(' json result: %s' % j) - except ValueError: - j = out - log.info(' raw result: %s' % j) - return (r, j) - -def get_zone_host_and_port(ctx, client, zone): - _, region_map = rgwadmin(ctx, client, check_status=True, - cmd=['-n', client, 'region-map', 'get']) - regions = 
region_map['regions'] - for region in regions: - for zone_info in region['val']['zones']: - if zone_info['name'] == zone: - endpoint = urlparse(zone_info['endpoints'][0]) - host, port = endpoint.hostname, endpoint.port - if port is None: - port = 80 - return host, port - assert False, 'no endpoint for zone {zone} found'.format(zone=zone) - -def get_master_zone(ctx, client): - _, region_map = rgwadmin(ctx, client, check_status=True, - cmd=['-n', client, 'region-map', 'get']) - regions = region_map['regions'] - for region in regions: - is_master = (region['val']['is_master'] == "true") - log.info('region={r} is_master={ism}'.format(r=region, ism=is_master)) - if not is_master: - continue - master_zone = region['val']['master_zone'] - log.info('master_zone=%s' % master_zone) - for zone_info in region['val']['zones']: - if zone_info['name'] == master_zone: - return master_zone - log.info('couldn\'t find master zone') - return None - -def get_master_client(ctx, clients): - master_zone = get_master_zone(ctx, clients[0]) # can use any client for this as long as system configured correctly - if not master_zone: - return None - - for client in clients: - zone = zone_for_client(ctx, client) - if zone == master_zone: - return client - - return None - -def get_zone_system_keys(ctx, client, zone): - _, zone_info = rgwadmin(ctx, client, check_status=True, - cmd=['-n', client, - 'zone', 'get', '--rgw-zone', zone]) - system_key = zone_info['system_key'] - return system_key['access_key'], system_key['secret_key'] - -def zone_for_client(ctx, client): - ceph_config = ctx.ceph.conf.get('global', {}) - ceph_config.update(ctx.ceph.conf.get('client', {})) - ceph_config.update(ctx.ceph.conf.get(client, {})) - return ceph_config.get('rgw zone') - -def region_for_client(ctx, client): - ceph_config = ctx.ceph.conf.get('global', {}) - ceph_config.update(ctx.ceph.conf.get('client', {})) - ceph_config.update(ctx.ceph.conf.get(client, {})) - return ceph_config.get('rgw region') - -def 
radosgw_data_log_window(ctx, client): - ceph_config = ctx.ceph.conf.get('global', {}) - ceph_config.update(ctx.ceph.conf.get('client', {})) - ceph_config.update(ctx.ceph.conf.get(client, {})) - return ceph_config.get('rgw data log window', 30) - -def radosgw_agent_sync_data(ctx, agent_host, agent_port, full=False): - log.info('sync agent {h}:{p}'.format(h=agent_host, p=agent_port)) - method = "full" if full else "incremental" - return requests.post('http://{addr}:{port}/data/{method}'.format(addr = agent_host, port = agent_port, method = method)) - -def radosgw_agent_sync_metadata(ctx, agent_host, agent_port, full=False): - log.info('sync agent {h}:{p}'.format(h=agent_host, p=agent_port)) - method = "full" if full else "incremental" - return requests.post('http://{addr}:{port}/metadata/{method}'.format(addr = agent_host, port = agent_port, method = method)) - -def radosgw_agent_sync_all(ctx, full=False, data=False): - if ctx.radosgw_agent.procs: - for agent_client, c_config in ctx.radosgw_agent.config.iteritems(): - zone_for_client(ctx, agent_client) - sync_host, sync_port = get_sync_agent(ctx, agent_client) - log.debug('doing a sync via {host1}'.format(host1=sync_host)) - radosgw_agent_sync_metadata(ctx, sync_host, sync_port, full) - if (data): - radosgw_agent_sync_data(ctx, sync_host, sync_port, full) - -def host_for_role(ctx, role): - for target, roles in zip(ctx.config['targets'].iterkeys(), ctx.config['roles']): - if role in roles: - _, host = split_user(target) - return host - -def get_sync_agent(ctx, source): - for task in ctx.config['tasks']: - if 'radosgw-agent' not in task: - continue - for client, conf in task['radosgw-agent'].iteritems(): - if conf['src'] == source: - return host_for_role(ctx, source), conf.get('port', 8000) - return None, None diff --git a/teuthology/task_util/test/__init__.py b/teuthology/task_util/test/__init__.py deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/teuthology/task_util/test/test_rados.py 
b/teuthology/task_util/test/test_rados.py deleted file mode 100644 index ee1cfa62ab..0000000000 --- a/teuthology/task_util/test/test_rados.py +++ /dev/null @@ -1,40 +0,0 @@ -# -# The MIT License -# -# Copyright (C) 2014 Cloudwatt -# -# Author: Loic Dachary -# -# Permission is hereby granted, free of charge, to any person -# obtaining a copy of this software and associated documentation -# files (the "Software"), to deal in the Software without -# restriction, including without limitation the rights to use, -# copy, modify, merge, publish, distribute, sublicense, and/or sell -# copies of the Software, and to permit persons to whom the -# Software is furnished to do so, subject to the following -# conditions: -# -# The above copyright notice and this permission notice shall be -# included in all copies or substantial portions of the Software. -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -# HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -# WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR -# OTHER DEALINGS IN THE SOFTWARE. -# -from .. import rados - -class TestRados(object): - - def test_cmd_erasure_code_profile(self): - name = 'NAME' - cmd = rados.cmd_erasure_code_profile(name, {}) - assert 'k=2' in cmd - assert name in cmd - cmd = rados.cmd_erasure_code_profile(name, { 'k': '88' }) - assert 'k=88' in cmd - assert name in cmd -- 2.39.5