--- /dev/null
+# Cluster layout for the mixed system tests. Each list groups the roles that
+# are placed together (presumably one test node per list - confirm against
+# the teuthology roles convention).
+roles:
+# Three groups, each with one monitor, one MDS and one manager.
+- [mon.a, mds.a, mgr.z]
+- [mon.b, mds.b, mgr.y]
+- [mon.c, mds.c, mgr.x]
+# Five groups of three OSDs each (osd.0 .. osd.14).
+- [osd.0, osd.1, osd.2]
+- [osd.3, osd.4, osd.5]
+- [osd.6, osd.7, osd.8]
+- [osd.9, osd.10, osd.11]
+- [osd.12, osd.13, osd.14]
+# Two groups pairing an RGW instance with a client role; the workload file
+# in this change only targets client.0.
+- [rgw.0, client.0]
+- [rgw.1, client.1]
--- /dev/null
+# Setup tasks. Both are given an explicit null, i.e. no task-specific
+# settings are supplied here.
+tasks:
+- ssh-keys: null
+- ceph-ansible: null
--- /dev/null
+# Workload for the mixed system tests.
+# NOTE(review): the nesting indentation of this file looks flattened; the
+# hierarchy described in the comments below is inferred from the key names
+# and must be confirmed before the file is used.
+tasks:
+# Every "-" entry below belongs to the parallel task (presumably they are
+# run concurrently - confirm against the teuthology parallel task).
+- parallel:
+ # RGW IO run: test/script/clients/config are the keys read by
+ # mixed_system_test.rgw_ios; "config" overrides the yaml shipped with the
+ # test script.
+ - mixed_system_test.rgw_ios:
+ test: Mbuckets_with_Nobjects
+ script: test_Mbuckets_with_Nobjects.py
+ clients: ['client.0']
+ config:
+ user_count: 5
+ bucket_count: 5
+ objects_count: 20
+ objects_size_range:
+ min: 100
+ max: 200
+ test_ops:
+ create_bucket: true
+ create_object: true
+ download_object: true
+ delete_bucket_object: true
+ # NOTE(review): sharding/compression are presumably children of test_ops;
+ # verify against the ceph-qe-scripts config layout.
+ sharding:
+ enable: false
+ max_shards: 0
+ compression:
+ enable: false
+ type: zlib
+ # Same script as the entry above, with upload_type: multipart and a
+ # larger objects_size_range.
+ - mixed_system_test.rgw_ios:
+ test: Mbuckets_with_Nobjects_multipart
+ script: test_Mbuckets_with_Nobjects.py
+ clients: ['client.0']
+ config:
+ user_count: 2
+ bucket_count: 5
+ objects_count: 10
+ objects_size_range:
+ min: 1000
+ max: 1500
+ test_ops:
+ create_bucket: true
+ create_object: true
+ download_object: true
+ delete_bucket_object: true
+ upload_type: multipart
+ sharding:
+ enable: false
+ max_shards: 0
+ compression:
+ enable: false
+ type: zlib
+ # Versioning run: versioning is enabled and suspended, with uploads after
+ # the suspend; object versions are not deleted.
+ - mixed_system_test.rgw_ios:
+ test: versioning_ops
+ script: test_versioning_with_objects.py
+ clients: ['client.0']
+ config:
+ user_count: 2
+ bucket_count: 2
+ objects_count: 10
+ version_count: 5
+ objects_size_range:
+ min: 50
+ max: 80
+ test_ops:
+ enable_version: true
+ suspend_version: true
+ copy_to_version: false
+ delete_object_versions: false
+ upload_after_suspend: true
+ # Versioning run, delete variant: versioning stays enabled (no suspend)
+ # and object versions are deleted.
+ - mixed_system_test.rgw_ios:
+ test: versioning_ops_delete
+ script: test_versioning_with_objects.py
+ clients: ['client.0']
+ config:
+ user_count: 2
+ bucket_count: 2
+ objects_count: 10
+ version_count: 5
+ objects_size_range:
+ min: 50
+ max: 80
+ test_ops:
+ enable_version: true
+ suspend_version: false
+ copy_to_version: false
+ delete_object_versions: true
+ upload_after_suspend: false
+ # fio workload on client.0 through the rbd io-engine, random read/write
+ # (runtime is presumably in seconds - confirm against the rbd_fio task).
+ - rbd_fio:
+ client.0:
+ fio-io-size: 100%
+ formats: [2]
+ features: [[layering],[layering,exclusive-lock,object-map]]
+ io-engine: rbd
+ test-clone-io: 1
+ rw: randrw
+ runtime: 600
+ # Daemon types handed to mixed_system_test.restart_tests, which processes
+ # them one after another in the listed order.
+ - mixed_system_test.restart_tests:
+ daemons: ["mgr", "mon", "osd"]
--- /dev/null
+import contextlib
+import logging
+
+from tasks.mixed_system_tests import system
+from tasks.mixed_system_tests import ios
+
+log = logging.getLogger(__name__)
+
+
+@contextlib.contextmanager
+def rgw_ios(ctx, config):
+ """
+ Task to run RGW IO's using ceph-QE-scripts repo.
+ Args:
+ ctx: cluster obj
+ config: test data
+ example:
+ tasks:
+ - rgw-system-test:
+ test: <test-name>
+ script: <script-name> | default value is <test-name>.py
+ test_version: <test-version> | ex: v1 or v2, default value is v2
+ clients: <clients list> | ex: [client.0, client.1]
+ default value is ['client.0]
+ config: <configuration of the test-name> |
+ default values is the yaml file config from ceph-qe-scripts
+ """
+ rgw_ios_internal = ios.rgw_ios(ctx, config)
+ try:
+ rgw_ios_internal.__enter__()
+ yield
+ except Exception as err:
+ log.info(err)
+ assert False, err
+ finally:
+ rgw_ios_internal.__exit__()
+
+
+@contextlib.contextmanager
+def restart_tests(ctx, config):
+ """
+ Perform restart test scenarios based on the daemon sequentially
+ 1) stop daemon.
+ a. verify IO & cluster health.
+ b. start daemon and wait for cluster status to be healthy.
+ 2) restart daemon.
+ a. verify IO & cluster health.
+ b. wait for cluster status to be healthy.
+ 3) reboot daemon node.
+ a. verify IO & cluster health.
+ b. wait for node up & running, and cluster status to be healthy.
+ Args:
+ ctx: context obj
+ config: test configuration
+ example:
+ mixed_system_test.restart_tests:
+ config:
+ daemon: ["mon", "mgr", "mds"]
+ """
+ daemons = config.get('daemons')
+ try:
+ for daemon in daemons:
+ assert system.ceph_daemon_system_test(ctx, daemon)
+ log.info("{} completed".format(daemon))
+ yield
+ except Exception as err:
+ assert False, err
+ finally:
+ log.info("Daemon(s) Service system tests completed")
--- /dev/null
+"""
+Add your constants here
+
+"""
+
+# Settings for the ceph-qe-scripts checkout used by the RGW IO task.
+cephqe_scripts = {
+ # Per test version ("v1"/"v2"): directories, relative to the repo root,
+ # that hold the test scripts and their yaml configuration files.
+ "DIR": {"v1": {"script": "rgw/v1/tests/s3/", "config": "rgw/v1/tests/s3/yamls"},
+ "v2": {"script": "rgw/v2/tests/s3_swift/", "config": "rgw/v2/tests/s3_swift/configs"}
+ },
+ # Branch that is cloned when WIP_BRANCH is None.
+ "MASTER_BRANCH": "master",
+ # Name of the directory created by the clone; prefix of script/config paths.
+ "REPO_NAME": "ceph-qe-scripts",
+ # Set to a branch name to clone that branch instead of MASTER_BRANCH.
+ "WIP_BRANCH": None
+}
+
+# Cluster health status strings keyed by a short alias; the system tests
+# use them as the accepted values for the reported health status.
+HEALTH = {
+ "error": "HEALTH_ERR",
+ "warn": "HEALTH_WARN",
+ "good": "HEALTH_OK"
+}
--- /dev/null
+"""
+Component IOs
+"""
+
+import yaml
+import logging
+import os
+import pwd
+import time
+
+from teuthology.orchestra import run
+from constants import cephqe_scripts
+
+log = logging.getLogger(__name__)
+
+
+class rgw_ios:
+ """
+ RGW IOS using ceph-qe-scripts
+ """
+
+ def __init__(self, ctx, config):
+ self.ctx = ctx
+ self.config = config
+
+ def __enter__(self):
+ log.info('starting rgw-tests')
+ log.info('config %s' % self.config)
+ if self.config is None:
+ self.config = {}
+ assert isinstance(self.config, dict), \
+ "task set-repo only supports a dictionary for configuration"
+ config_file_name = self.config['test'] + ".yaml"
+ log.info('test_version: %s' % self.config.get('test_version', 'v2'))
+ log.info('test: %s' % self.config['test'])
+ log.info('script: %s' % self.config.get('script', self.config['test'] + ".py"))
+ test_root_dir = self.config['test'] + "_%d" % int(time.time())
+ test_venv = os.path.join(test_root_dir, "venv")
+ script = os.path.join(cephqe_scripts['REPO_NAME'],
+ cephqe_scripts['DIR'][self.config.get('test_version', 'v2')]['script'],
+ self.config.get('script', self.config['test'] + ".py"))
+ config_file = os.path.join(cephqe_scripts['REPO_NAME'],
+ cephqe_scripts['DIR'][self.config.get('test_version', 'v2')]['config'],
+ config_file_name)
+ log.info('script: %s' % script)
+ log.info('config_file: %s' % config_file)
+ self.soot = [test_venv, test_root_dir, 'io_info.yaml', '*.json', 'Download.*',
+ 'Download', '*.mpFile', 'x*', 'key.*', 'Mp.*', '*.key.*']
+ self.cleanup = lambda x: remote.run(args=[run.Raw('sudo rm -rf %s' % x)])
+ log.info('listing all clients: %s' % self.config.get('clients'))
+ for role in self.config.get('clients', ['client.0']):
+ wip_branch = cephqe_scripts["WIP_BRANCH"]
+ master_branch = cephqe_scripts["MASTER_BRANCH"]
+ assert isinstance(role, basestring)
+ prefix = 'client.'
+ assert role.startswith(prefix)
+ id_ = role[len(prefix):]
+ (remote,) = self.ctx.cluster.only(role).remotes.iterkeys()
+ map(self.cleanup, self.soot)
+ remote.run(args=['mkdir', test_root_dir])
+ log.info('cloning the repo to %s' % remote.hostname)
+ remote.run(
+ args=[
+ 'cd',
+ '%s' % test_root_dir,
+ run.Raw(';'),
+ 'git',
+ 'clone',
+ 'https://github.com/red-hat-storage/ceph-qe-scripts.git',
+ '-b',
+ '%s' % master_branch if wip_branch is None else wip_branch
+ ])
+ if self.config.get('config', None) is not None:
+ test_config = {'config': self.config.get('config')}
+ log.info('config: %s' % test_config)
+ log.info('creating configuration from data: %s' % test_config)
+ local_file = os.path.join('/tmp/',
+ config_file_name +
+ "_" + str(os.getpid()) +
+ pwd.getpwuid(os.getuid()).pw_name)
+ with open(local_file, 'w') as outfile:
+ outfile.write(yaml.dump(test_config, default_flow_style=False))
+ out = remote.run(args=[run.Raw('sudo echo $HOME')],
+ wait=False,
+ stdout=run.PIPE)
+ out = out.stdout.read().strip()
+ conf_file = os.path.join(out, test_root_dir, config_file)
+ log.info('local_file: %s' % local_file)
+ log.info('config_file: %s' % conf_file)
+ log.info('copying temp yaml to the client node')
+ remote.put_file(local_file, conf_file)
+ remote.run(args=['ls', '-lt', os.path.dirname(conf_file)])
+ remote.run(args=['cat', conf_file])
+ os.remove(local_file)
+ remote.run(args=['python3', '-m', 'venv', test_venv])
+ remote.run(
+ args=[
+ 'source',
+ '{}/bin/activate'.format(test_venv),
+ run.Raw(';'),
+ run.Raw('pip3 install boto boto3 names PyYaml ConfigParser'),
+ run.Raw(';'),
+ 'deactivate'])
+
+ time.sleep(60)
+ log.info('trying to restart rgw service after sleep 60 secs')
+ out = remote.run(args=[run.Raw('sudo systemctl is-active ceph-radosgw.target')],
+ wait=False,
+ stdout=run.PIPE)
+ try:
+ out = out.stdout.read().strip()
+ except AttributeError:
+ out = "inactive"
+ if "inactive" in out:
+ log.info('Restarting RGW service')
+ remote.run(args=[run.Raw('sudo systemctl restart ceph-radosgw.target')])
+ log.info('starting the tests after sleep of 60 secs')
+ time.sleep(60)
+ remote.run(
+ args=[run.Raw('sudo cd %s ' % test_root_dir)])
+ remote.run(args=[
+ run.Raw('cd %s; sudo venv/bin/python3 %s -c %s ' % (test_root_dir,
+ script,
+ config_file))])
+
+ def __exit__(self):
+ for role in self.config.get('clients', ['client.0']):
+ (remote,) = self.ctx.cluster.only(role).remotes.iterkeys()
+ log.info('Test completed')
+ log.info("Deleting leftovers")
+ map(self.cleanup, self.soot)
--- /dev/null
+"""
+System tests
+"""
+import logging
+from time import sleep
+from constants import HEALTH
+
+from teuthology.orchestra import run
+
+log = logging.getLogger(__name__)
+
+
+def check_service_status(ctx, dstate, **args):
+ """
+ check service status and cluster health_status
+ Args:
+ ctx: ceph context obj
+ dstate: daemon state obj
+ args: arguments
+ (ex., timeout: 120(default)
+ state: Health states list (ex., [HEALTH_ERR, HEALTH_WARN])
+ exit_status: exit status
+ check: true)
+ """
+ try:
+ # Check daemon restart/start status
+ timeout = 120
+ interval = 5
+
+ if args.get('timeout'):
+ timeout = args['timeout']
+
+ iterations = timeout / interval
+ exit_status = args.get('exit_status')
+
+ while iterations:
+ log.info("Check {} {} daemon status".format(dstate.role,
+ dstate.id_))
+ if dstate.check_status() is not exit_status:
+ log.warn("{} {} is still not {}".format(dstate.role,
+ dstate.id_, exit_status))
+ sleep(interval)
+ iterations -= 1
+ continue
+ break
+ else:
+ assert False
+
+ # check cluster health
+ cluster = ctx.managers.keys()[0]
+ check_status = args.get('check_status', False)
+ check_key = args.get('check_keys')
+ health_state = args.get('state')
+
+ while timeout:
+ sleep(interval)
+ timeout -= interval
+ cluster_status = ctx.managers[cluster].raw_cluster_status()
+ health = cluster_status.get('health')
+ status = health['status']
+ checks = health['checks']
+
+ try:
+ if check_status:
+ assert status in health_state, \
+ "[ {} ] not found in health status {}".format(health_state, status)
+ log.info(" Cluster health status : {} as expected".format(status))
+ if check_key:
+ check_key = [check_key] if not isinstance(check_key, list) else check_key
+
+ for chk in check_key:
+ assert chk.upper() in checks, \
+ "[ {} ] not found in health checks {}".format(chk, checks)
+ log.info("[ {} ] found in cluster health checks as expected".format(chk))
+ log.info(" Cluster health status : {}".format(checks))
+ return health
+ except AssertionError as err:
+ log.warn(err)
+ log.warn("Retrying with {} seconds left".format(timeout))
+ continue
+ else:
+ assert False, "[ {} ] not found in health checks".format(health_state)
+ except AssertionError:
+ assert False
+
+
+def reboot_node(dstate, **args):
+ """
+ Reboot daemon node
+ Args:
+ dstate: daemon dstate
+ args: reboot arguments(ex., timeout=300, interval=30)
+ """
+ timeout = 600
+ interval = 30
+
+ if args.get('timeout'):
+ timeout = args['timeout']
+ if args.get('interval'):
+ interval = args['interval']
+
+ try:
+ # reboot node
+ dstate.remote.run(args=["sudo", "shutdown", "-r", "now", run.Raw("&")])
+
+ # wait for ssh reconnection
+ assert dstate.remote.reconnect(timeout=timeout, sleep_time=interval),\
+ " [ {} ] Reboot failed".format(dstate.id_)
+ log.info(" [ {} ] Reboot successful".format(dstate.id_))
+ return True
+ except AssertionError as err:
+ assert False, err
+
+
+def ceph_daemon_system_test(ctx, daemon):
+ """
+ Perform sequential actions on daemon.
+ 1) stop daemon, check IO and cluster status
+ 2) re/start daemon, check IO and cluster status
+ 3) reboot node, check IO and cluster
+ Args:
+ ctx: ceph context obj
+ daemon: ceph daemon
+ """
+ daemon = "ceph.%s" % daemon.lower() \
+ if not daemon.lower().startswith("ceph.") else daemon
+
+ kwargs = {
+ "timeout": 120,
+ "exit_status": None,
+ "state": None,
+ "check_status": True,
+ "verify_status": None,
+ "check_keys": None
+ }
+
+ try:
+ # Get daemon nodes with SystemDState obj from ctx
+ daemons = ctx.daemons.daemons.get(daemon)
+ for name, dstate in daemons.items():
+ # stop and verify the cluster status
+ dstate.stop()
+ kwargs['exit_status'] = 0
+ kwargs['state'] = [HEALTH['warn']]
+ kwargs['check_keys'] = "{}_down".format(dstate.daemon_type)
+
+ check_service_status(ctx, dstate, **kwargs)
+
+ # start and verify the cluster status
+ dstate.restart()
+ kwargs['exit_status'] = None
+ kwargs['state'] = [HEALTH['warn'], HEALTH['good']]
+ kwargs['check_keys'] = None
+ check_service_status(ctx, dstate, **kwargs)
+
+ # restart daemon and verify cluster status
+ dstate.restart()
+ check_service_status(ctx, dstate, **kwargs)
+
+ # reboot daemon node and verify cluster status
+ reboot_node(dstate, timeout=600, interval=30)
+ log.info("[ ({}, {}) ] daemon system tests Completed".format(daemon, dstate.id_))
+ return True
+ except KeyError as err:
+ log.error("No {}(s) found".format(daemon))
+ assert False, err
+ finally:
+ log.info("Daemon service system tests Completed")