From 0e5ba6fce29967a1e55f6860613cfbd7649f8b22 Mon Sep 17 00:00:00 2001 From: Shilpa Jagannath Date: Fri, 13 Jul 2018 21:53:39 +0530 Subject: [PATCH] Task and yamls for testing delay and link failure/recovery with multisite --- .../multisite-ansible/tasks/test_delay.yaml | 48 ++++ .../tasks/test_delay_variable.yaml | 48 ++++ .../tasks/test_link_failure.yaml | 48 ++++ qa/tasks/netem.py | 230 ++++++++++++++++++ 4 files changed, 374 insertions(+) create mode 100644 qa/suites/rgw/multisite-ansible/tasks/test_delay.yaml create mode 100644 qa/suites/rgw/multisite-ansible/tasks/test_delay_variable.yaml create mode 100644 qa/suites/rgw/multisite-ansible/tasks/test_link_failure.yaml create mode 100644 qa/tasks/netem.py diff --git a/qa/suites/rgw/multisite-ansible/tasks/test_delay.yaml b/qa/suites/rgw/multisite-ansible/tasks/test_delay.yaml new file mode 100644 index 0000000000000..032c711b9885b --- /dev/null +++ b/qa/suites/rgw/multisite-ansible/tasks/test_delay.yaml @@ -0,0 +1,48 @@ +tasks: +- ssh-keys: null +- ceph-ansible: + vars: + cluster: c1 +- ceph-ansible: + vars: + cluster: c2 +- new-rgw-multisite: + realm: + name: test-realm + is_default: true + zonegroups: + - name: test-zg + is_master: true + is_default: true + zones: + - name: test-zone + is_master: true + is_default: true + endpoints: [c1.rgw.0] + - name: test-zone2 + is_default: true + endpoints: [c2.rgw.1] +- netem: + clients: [c1.rgw.0] + dst_client: c2.rgw.1 + delay: 30ms +- multisite-test.userexec: + master_client: c1.rgw.0 + test_dir_version: v1 + master_config: + cluster_name: c1 + user_count: 3 + target_client: c2.rgw.1 +- sleep: + duration: 60 +- multisite-test: + test-name: test_Mbuckets + test_dir_version: v1 + master_client: c1.rgw.0 + target_client: c2.rgw.1 + bucket_count: 5 +- sleep: + duration: 60 +- multisite-test.pull-io-info: +- verify-io: + verification_script: read_io_info \ No newline at end of file diff --git a/qa/suites/rgw/multisite-ansible/tasks/test_delay_variable.yaml b/qa/suites/rgw/multisite-ansible/tasks/test_delay_variable.yaml new file mode 100644 index 0000000000000..14ec8b29753ba --- /dev/null +++ b/qa/suites/rgw/multisite-ansible/tasks/test_delay_variable.yaml @@ -0,0 +1,48 @@ +tasks: +- ssh-keys: null +- ceph-ansible: + vars: + cluster: c1 +- ceph-ansible: + vars: + cluster: c2 +- new-rgw-multisite: + realm: + name: test-realm + is_default: true + zonegroups: + - name: test-zg + is_master: true + is_default: true + zones: + - name: test-zone + is_master: true + is_default: true + endpoints: [c1.rgw.0] + - name: test-zone2 + is_default: true + endpoints: [c2.rgw.1] +- netem: + clients: [c1.rgw.0] + dst_client: c2.rgw.1 + delay_range: [20ms, 40ms] +- multisite-test.userexec: + master_client: c1.rgw.0 + test_dir_version: v1 + master_config: + cluster_name: c1 + user_count: 3 + target_client: c2.rgw.1 +- sleep: + duration: 60 +- multisite-test: + test-name: test_Mbuckets + test_dir_version: v1 + master_client: c1.rgw.0 + target_client: c2.rgw.1 + bucket_count: 5 +- sleep: + duration: 60 +- multisite-test.pull-io-info: +- verify-io: + verification_script: read_io_info \ No newline at end of file diff --git a/qa/suites/rgw/multisite-ansible/tasks/test_link_failure.yaml b/qa/suites/rgw/multisite-ansible/tasks/test_link_failure.yaml new file mode 100644 index 0000000000000..03dfb0d8ba82b --- /dev/null +++ b/qa/suites/rgw/multisite-ansible/tasks/test_link_failure.yaml @@ -0,0 +1,48 @@ +tasks: +- ssh-keys: null +- ceph-ansible: + vars: + cluster: c1 +- ceph-ansible: + vars: + cluster: c2 +- new-rgw-multisite: + realm: + name: test-realm + is_default: true + zonegroups: + - name: test-zg + is_master: true + is_default: true + zones: + - name: test-zone + is_master: true + is_default: true + endpoints: [c1.rgw.0] + - name: test-zone2 + is_default: true + endpoints: [c2.rgw.1] +- netem: + clients: [c1.rgw.0] + dst_client: c2.rgw.1 + link_toggle_interval: 10 # no need to mention units. Dafault takes seconds. +- multisite-test.userexec: + master_client: c1.rgw.0 + test_dir_version: v1 + master_config: + cluster_name: c1 + user_count: 3 + target_client: c2.rgw.1 +- sleep: + duration: 60 +- multisite-test: + test-name: test_Mbuckets + test_dir_version: v1 + master_client: c1.rgw.0 + target_client: c2.rgw.1 + bucket_count: 5 +- sleep: + duration: 60 +- multisite-test.pull-io-info: +- verify-io: + verification_script: read_io_info \ No newline at end of file diff --git a/qa/tasks/netem.py b/qa/tasks/netem.py new file mode 100644 index 0000000000000..7cea45463969e --- /dev/null +++ b/qa/tasks/netem.py @@ -0,0 +1,230 @@ +""" +Task to run tests with network delay between two remotes using netem +""" + +import logging +import contextlib +from teuthology import misc as teuthology +from cStringIO import StringIO +from teuthology.orchestra import run +from teuthology.orchestra import remote as orchestra_remote +from teuthology import contextutil +from paramiko import SSHException +import socket +import time +import gevent + +log = logging.getLogger(__name__) + +def set_priority(): + + # create a priority queueing discipline + return ['sudo', 'tc', 'qdisc', 'add', 'dev', 'eno1', 'root', 'handle', '1:', 'prio'] + +def show_tc(): + + # shows tc device present + return ['sudo', 'tc', 'qdisc', 'show', 'dev', 'eno1'] + +def del_tc(): + + return ['sudo', 'tc', 'qdisc', 'del', 'dev', 'eno1', 'root'] + +def cmd_prefix(): + + # prepare command to set delay + cmd1 = ['sudo', 'tc', 'qdisc', 'add', 'dev', 'eno1', 'parent', + '1:1', 'handle', '2:', 'netem', 'delay'] + + # prepare command to change delay + cmd2 = ['sudo', 'tc', 'qdisc', 'replace', 'dev', 'eno1', 'root', 'netem', 'delay'] + + # prepare command to apply filter to the matched ip/host + + cmd3 = ['sudo', 'tc', 'filter', 'add', 'dev', 'eno1', + 'parent', '1:0', 'protocol', 'ip', 'pref', '55', + 'handle', '::55', 'u32', 'match', 'ip', 'dst'] + + return cmd1, cmd2, cmd3 + +def static_delay(remote, host, delay): + + set_delay, change_delay, set_ip = cmd_prefix() + + ip = socket.gethostbyname(host.hostname) + + r = remote.run(args=show_tc(), stdout=StringIO()) + if r.stdout.getvalue().strip().find('refcnt') == -1: + # call set_priority() func to create priority queue + # if not already created(indicated by -1) + log.info('Create priority queue') + remote.run(args=set_priority()) + + # set static delay, with +/- 5ms jitter with normal distribution as default + log.info('Setting delay to %s' % delay) + set_delay.extend(['%s' % delay, '5ms', 'distribution', 'normal']) + remote.run(args=set_delay) + + # set delay to a particular remote node via ip + log.info('Delay set on %s' % remote) + set_ip.extend(['%s' % ip, 'flowid', '2:1']) + remote.run(args=set_ip) + remote.run(args=show_tc(), stdout=StringIO()) + else: + # if the device is already created, only change the delay + log.info('Setting delay to %s' % delay) + change_delay.extend(['%s' % delay, '5ms', 'distribution', 'normal']) + remote.run(args=change_delay) + remote.run(args=show_tc(), stdout=StringIO()) + +def variable_delay(remote, host, delay_range=[]): + + """ Vary delay between two values""" + + set_delay, change_delay, set_ip = cmd_prefix() + + ip = socket.gethostbyname(host.hostname) + + # delay1 has to be lower than delay2 + delay1 = delay_range[0] + delay2 = delay_range[1] + + r = remote.run(args=show_tc(), stdout=StringIO()) + if r.stdout.getvalue().strip().find('refcnt') == -1: + # call set_priority() func to create priority queue + # if not already created(indicated by -1) + remote.run(args=set_priority()) + + # set variable delay + log.info('Setting varying delay') + set_delay.extend(['%s' % delay1, '%s' % delay2]) + remote.run(args=set_delay) + + # set delay to a particular remote node via ip + log.info('Delay set on %s' % remote) + set_ip.extend(['%s' % ip, 'flowid', '2:1']) + remote.run(args=set_ip) + remote.run(args=show_tc(), stdout=StringIO()) + else: + # if the device is already created, only change the delay + log.info('Setting varying delay') + change_delay.extend(['%s' % delay1, '%s' % delay2]) + remote.run(args=change_delay) + remote.run(args=show_tc(), stdout=StringIO()) + + +class Toggle: + + stop_event = gevent.event.Event() + + def __init__(self, remote, host, interval): + self.remote = remote + self.host = host + self.interval = interval + self.ip = socket.gethostbyname(self.host.hostname) + + def delete_dev(self): + + """ Delete the qdisc """ + + log.info('Delete tc') + self.remote.run(args=del_tc()) + + def packet_drop(self): + + """ Drop packets to the remote ip specified""" + + _, _, set_ip = cmd_prefix() + + r = self.remote.run(args=show_tc(), stdout=StringIO()) + if r.stdout.getvalue().strip().find('refcnt') == -1: + self.remote.run(args=set_priority()) + # packet drop to specific ip + log.info('Drop all packets to %s' % self.host) + set_ip.extend(['%s' % self.ip, 'action', 'drop']) + self.remote.run(args=set_ip) + + + def link_toggle(self): + # For toggling packet drop and recovery in regular interval. + # If interval is 5s, link is up for 5s and link is down for 5s + + while not self.stop_event.is_set(): + self.stop_event.wait(timeout=self.interval) + # simulate link down + try: + self.packet_drop() + log.info('link down') + except SSHException as e: + log.debug('Failed to run command') + + self.stop_event.wait(timeout=self.interval) + # if qdisc exist,delete it. + try: + self.delete_dev() + log.info('link up') + except SSHException as e: + log.debug('Failed to run command') + + def begin(self): + self.thread = gevent.spawn(self.link_toggle) + + def end(self): + self.stop_event.set() + self.thread.get() + + +@contextlib.contextmanager +def task(ctx, config): + + """ + - netem: + clients: [c1.rgw.0] + dst_client: [c2.rgw.1] + delay: 10ms + + - netem: + clients: [c1.rgw.0] + dst_client: [c2.rgw.1] + delay_range: [10ms, 20ms] # (min, max) + + - netem: + clients: [rgw.1, mon.0] + dst_client: [c2.rgw.1] + link_toggle_interval: 10 # no unit mentioned. By default takes seconds. + + + """ + + log.info('config %s' % config) + + assert isinstance(config, dict), \ + "please list clients to run on" + + dst = config.get('dst_client') + (host,) = ctx.cluster.only(dst).remotes.iterkeys() + + for role in config.get('clients'): + (remote,) = ctx.cluster.only(role).remotes.iterkeys() + if config.get('delay', False): + static_delay(remote, host, config.get('delay')) + if config.get('delay_range', False): + variable_delay(remote, host, config.get('delay_range')) + if config.get('link_toggle_interval', False): + log.info('Toggling link for %s' % config.get('link_toggle_interval')) + toggle = Toggle(remote, host, config.get('link_toggle_interval')) + toggle.begin() + + # t = threading.Thread(target=link_toggle(remote, config.get('dst_client'), config.get('interval'))) + # t.daemon = True + # t.start() + + try: + yield + finally: + if config.get('link_toggle_interval'): + toggle.end() + for role in config.get('clients'): + (remote,) = ctx.cluster.only(role).remotes.iterkeys() + remote.run(args=['sudo', 'tc', 'qdisc', 'del', 'eno1', 'root']) + -- 2.39.5