qa/suites/rados: replace mon_seesaw.py task with a small bash script

author Sage Weil <sage@redhat.com>

Fri, 21 Dec 2018 18:26:29 +0000 (12:26 -0600)

committer Sage Weil <sage@redhat.com>

Thu, 3 Jan 2019 17:17:31 +0000 (11:17 -0600)
author Sage Weil <sage@redhat.com>
Fri, 21 Dec 2018 18:26:29 +0000 (12:26 -0600)
committer Sage Weil <sage@redhat.com>
Thu, 3 Jan 2019 17:17:31 +0000 (11:17 -0600)
diff --git a/qa/standalone/mon/mon-seesaw.sh b/qa/standalone/mon/mon-seesaw.sh

new file mode 100755 (executable)

index 0000000..1c97847
--- /dev/null
+++ b/qa/standalone/mon/mon-seesaw.sh
@@ -0,0 +1,72 @@
+#!/usr/bin/env bash
+
+source $CEPH_ROOT/qa/standalone/ceph-helpers.sh
+
+function run() {
+    local dir=$1
+    shift
+    # Addresses the test binds its monitors to (aa, bb, and the re-added aa).
+    export CEPH_MON_A="127.0.0.1:7139" # git grep '\<7139\>' : there must be only one
+    export CEPH_MON_B="127.0.0.1:7141" # git grep '\<7141\>' : there must be only one
+    export CEPH_MON_C="127.0.0.1:7142" # git grep '\<7142\>' : there must be only one
+    export CEPH_ARGS
+    CEPH_ARGS+="--fsid=$(uuidgen) --auth-supported=none "
+    # Keep a copy without --mon-host so tests can rebuild CEPH_ARGS per mon set.
+    export BASE_CEPH_ARGS=$CEPH_ARGS
+    CEPH_ARGS+="--mon-host=$CEPH_MON_A "
+    # Run the tests named in the remaining args, else every TEST_* function.
+    local funcs=${@:-$(set | sed -n -e 's/^\(TEST_[0-9a-z_]*\) .*/\1/p')}
+    for func in $funcs ; do
+        setup $dir || return 1
+        $func $dir || return 1
+        teardown $dir || return 1
+    done
+}
+
+function TEST_mon_seesaw() {
+    local dir=$1
+
+    setup $dir || return
+
+    # start with 1 mon
+    run_mon $dir aa --public-addr $CEPH_MON_A || return 1
+    run_mgr $dir x || return 1
+    run_osd $dir 0 || return 1
+    run_osd $dir 1 || return 1
+    run_osd $dir 2 || return 1
+
+    wait_for_quorum 300 1 || return 1
+
+    # add in a second
+    run_mon $dir bb --public-addr $CEPH_MON_B || return 1
+    CEPH_ARGS="$BASE_CEPH_ARGS --mon-host=$CEPH_MON_A,$CEPH_MON_B"
+    wait_for_quorum 300 2 || return 1
+
+    # remove the first one
+    ceph mon rm aa || return 1
+    CEPH_ARGS="$BASE_CEPH_ARGS --mon-host=$CEPH_MON_B"
+    sleep 5
+    wait_for_quorum 300 1 || return 1
+
+    # do some stuff that requires the osds be able to communicate with the
+    # mons.  (see http://tracker.ceph.com/issues/17558)
+    ceph osd pool create foo 8
+    rados -p foo bench 1 write
+    wait_for_clean || return 1
+
+    # nuke monstore so that it will rejoin (otherwise we get
+    # "not in monmap and have been in a quorum before; must have been removed"
+    rm -rf $dir/aa
+
+    # add a back in
+    # (use a different addr to avoid bind issues)
+    run_mon $dir aa --public-addr $CEPH_MON_C || return 1
+    CEPH_ARGS="$BASE_CEPH_ARGS --mon-host=$CEPH_MON_C,$CEPH_MON_B"
+    wait_for_quorum 300 2 || return 1
+}
+
+main mon-seesaw "$@"
+
+# Local Variables:
+# compile-command: "cd ../.. ; make -j4 && test/mon/mon-seesaw.sh"
+# End:
diff --git a/qa/suites/rados/singleton/all/mon-seesaw.yaml b/qa/suites/rados/singleton/all/mon-seesaw.yaml

deleted file mode 100644 (file)

index 587d8c2..0000000
--- a/qa/suites/rados/singleton/all/mon-seesaw.yaml
+++ /dev/null
@@ -1,32 +0,0 @@
-roles:
-- - mon.a
-  - mgr.x
-  - osd.0
-  - osd.1
-  - osd.2
-openstack:
-  - volumes: # attached to each instance
-      count: 3
-      size: 10 # GB
-tasks:
-- install:
-- ceph:
-    config:
-      global:
-        osd pool default min size : 1
-      osd:
-        debug monc: 1
-        debug ms: 1
-    log-whitelist:
-      - overall HEALTH
-      - Manager daemon
-      - \(MGR_DOWN\)
-      - \(PG_AVAILABILITY\)
-- mon_seesaw:
-- ceph_manager.create_pool:
-    kwargs:
-      pool_name: test
-      pg_num: 1
-- ceph_manager.wait_for_clean:
-    kwargs:
-      timeout: 60
diff --git a/qa/suites/rados/standalone/workloads/mon-seesaw.yaml b/qa/suites/rados/standalone/workloads/mon-seesaw.yaml

new file mode 100644 (file)

index 0000000..c36d6a3
--- /dev/null
+++ b/qa/suites/rados/standalone/workloads/mon-seesaw.yaml
@@ -0,0 +1,18 @@
+roles:
+- - mon.a
+  - mgr.x
+  - osd.0
+  - osd.1
+  - osd.2
+  - client.0
+openstack:
+  - volumes: # attached to each instance
+      count: 3
+      size: 10 # GB
+tasks:
+- install:
+- workunit:
+    basedir: qa/standalone
+    clients:
+      all:
+        - mon/mon-seesaw.sh
diff --git a/qa/tasks/mon_seesaw.py b/qa/tasks/mon_seesaw.py

deleted file mode 100644 (file)

index 4b2684e..0000000
--- a/qa/tasks/mon_seesaw.py
+++ /dev/null
@@ -1,198 +0,0 @@
-from cStringIO import StringIO
-
-import contextlib
-import logging
-import random
-
-from teuthology import misc as teuthology
-from teuthology.orchestra import run
-
-from ceph_manager import CephManager, write_conf
-
-
-log = logging.getLogger(__name__)
-
-
-def _get_mons(ctx):
-    return [name[len('mon.'):] for name in teuthology.get_mon_names(ctx)]
-
-
-# teuthology prepares the monitor IPs (and ports) in get_mons(), we can
-# enumerate all monitor ports ([6789..]), and find the next available one.
-def _get_next_port(ctx, ip, cluster):
-    # assuming we have only one cluster here.
-    used = []
-    for name in teuthology.get_mon_names(ctx, cluster):
-        addr = ctx.ceph[cluster].conf[name]['mon addr']
-        addr_type, mon_ip, mon_port = addr.split(':')
-        if mon_ip != ip:
-            continue
-        used.append(int(mon_port))
-    port = 6789
-    used.sort()
-    for p in used:
-        if p != port:
-            break
-        port += 1
-    return port
-
-
-def _setup_mon(ctx, manager, remote, mon, name, data_path, conf_path):
-    # co-locate a new monitor on remote where an existing monitor is hosted
-    cluster = manager.cluster
-    remote.run(args=['sudo', 'mkdir', '-p', data_path])
-    keyring_path = '/etc/ceph/{cluster}.keyring'.format(
-        cluster=manager.cluster)
-    testdir = teuthology.get_testdir(ctx)
-    monmap_path = '{tdir}/{cluster}.monmap'.format(tdir=testdir,
-                                                   cluster=cluster)
-    manager.raw_cluster_cmd('mon', 'getmap', '-o', monmap_path)
-    if manager.controller != remote:
-        monmap = teuthology.get_file(manager.controller, monmap_path)
-        teuthology.write_file(remote, monmap_path, StringIO(monmap))
-    remote.run(
-        args=[
-            'sudo',
-            'ceph-mon',
-            '--cluster', cluster,
-            '--mkfs',
-            '-i', mon,
-            '--monmap', monmap_path,
-            '--keyring', keyring_path])
-    if manager.controller != remote:
-        teuthology.delete_file(remote, monmap_path)
-    # raw_cluster_cmd() is performed using sudo, so sudo here also.
-    teuthology.delete_file(manager.controller, monmap_path, sudo=True)
-    # update ceph.conf so that the ceph CLI is able to connect to the cluster
-    if conf_path:
-        ip = remote.ip_address
-        port = _get_next_port(ctx, ip, cluster)
-        mon_addr = '{ip}:{port}'.format(ip=ip, port=port)
-        ctx.ceph[cluster].conf[name] = {'mon addr': mon_addr}
-        write_conf(ctx, conf_path, cluster)
-
-
-def _teardown_mon(ctx, manager, remote, name, data_path, conf_path):
-    cluster = manager.cluster
-    del ctx.ceph[cluster].conf[name]
-    write_conf(ctx, conf_path, cluster)
-    remote.run(args=['sudo', 'rm', '-rf', data_path])
-
-
-@contextlib.contextmanager
-def _prepare_mon(ctx, manager, remote, mon):
-    cluster = manager.cluster
-    data_path = '/var/lib/ceph/mon/{cluster}-{id}'.format(
-        cluster=cluster, id=mon)
-    conf_path = '/etc/ceph/{cluster}.conf'.format(cluster=cluster)
-    name = 'mon.{0}'.format(mon)
-    _setup_mon(ctx, manager, remote, mon, name, data_path, conf_path)
-    yield
-    _teardown_mon(ctx, manager, remote, name,
-                  data_path, conf_path)
-
-
-# run_daemon() in ceph.py starts a herd of daemons of the same type, but
-# _run_daemon() starts only one instance.
-@contextlib.contextmanager
-def _run_daemon(ctx, remote, cluster, type_, id_):
-    testdir = teuthology.get_testdir(ctx)
-    coverage_dir = '{tdir}/archive/coverage'.format(tdir=testdir)
-    daemon_signal = 'kill'
-    run_cmd = [
-        'sudo',
-        'adjust-ulimits',
-        'ceph-coverage',
-        coverage_dir,
-        'daemon-helper',
-        daemon_signal,
-    ]
-    run_cmd_tail = [
-        'ceph-%s' % (type_),
-        '-f',
-        '--cluster', cluster,
-        '-i', id_]
-    run_cmd.extend(run_cmd_tail)
-    ctx.daemons.add_daemon(remote, type_, id_,
-                           cluster=cluster,
-                           args=run_cmd,
-                           logger=log.getChild(type_),
-                           stdin=run.PIPE,
-                           wait=False)
-    daemon = ctx.daemons.get_daemon(type_, id_, cluster)
-    yield daemon
-    daemon.stop()
-
-
-@contextlib.contextmanager
-def task(ctx, config):
-    """
-    replace a monitor with a newly added one, and then revert this change
-
-    How it works::
-    1. add a mon with specified id (mon.victim_prime)
-    2. wait for quorum
-    3. remove a monitor with specified id (mon.victim), mon.victim will commit
-       suicide
-    4. wait for quorum
-    5. <yield>
-    5. add mon.a back, and start it
-    6. wait for quorum
-    7. remove mon.a_prime
-
-    Options::
-    victim       the id of the mon to be removed (pick a random mon by default)
-    replacer     the id of the new mon (use "${victim}_prime" if not specified)
-    """
-    first_mon = teuthology.get_first_mon(ctx, config)
-    (mon,) = ctx.cluster.only(first_mon).remotes.iterkeys()
-    manager = CephManager(mon, ctx=ctx, logger=log.getChild('ceph_manager'))
-
-    if config is None:
-        config = {}
-    assert isinstance(config, dict), \
-        "task ceph only supports a dictionary for configuration"
-    overrides = ctx.config.get('overrides', {})
-    teuthology.deep_merge(config, overrides.get('mon_seesaw', {}))
-    victim = config.get('victim', random.choice(_get_mons(ctx)))
-    replacer = config.get('replacer', '{0}_prime'.format(victim))
-    remote = manager.find_remote('mon', victim)
-    quorum = manager.get_mon_quorum()
-    cluster = manager.cluster
-    log.info('replacing {victim} with {replacer}'.format(victim=victim,
-                                                         replacer=replacer))
-    with _prepare_mon(ctx, manager, remote, replacer):
-        with _run_daemon(ctx, remote, cluster, 'mon', replacer):
-            # replacer will join the quorum automatically
-            manager.wait_for_mon_quorum_size(len(quorum) + 1, 10)
-            # if we don't remove the victim from monmap, there is chance that
-            # we are leaving the new joiner with a monmap of 2 mon, and it will
-            # not able to reach the other one, it will be keeping probing for
-            # ever.
-            log.info('removing {mon}'.format(mon=victim))
-            manager.raw_cluster_cmd('mon', 'remove', victim)
-            manager.wait_for_mon_quorum_size(len(quorum), 10)
-            # the victim will commit suicide after being removed from
-            # monmap, let's wait until it stops.
-            ctx.daemons.get_daemon('mon', victim, cluster).wait(10)
-            try:
-                # perform other tasks
-                yield
-            finally:
-                # bring the victim back online
-                # nuke the monstore of victim, otherwise it will refuse to boot
-                # with following message:
-                #
-                # not in monmap and have been in a quorum before; must have
-                # been removed
-                log.info('re-adding {mon}'.format(mon=victim))
-                data_path = '/var/lib/ceph/mon/{cluster}-{id}'.format(
-                    cluster=cluster, id=victim)
-                remote.run(args=['sudo', 'rm', '-rf', data_path])
-                name = 'mon.{0}'.format(victim)
-                _setup_mon(ctx, manager, remote, victim, name, data_path, None)
-                log.info('reviving {mon}'.format(mon=victim))
-                manager.revive_mon(victim)
-                manager.wait_for_mon_quorum_size(len(quorum) + 1, 10)
-                manager.raw_cluster_cmd('mon', 'remove', replacer)
-                manager.wait_for_mon_quorum_size(len(quorum), 10)
author	Sage Weil <sage@redhat.com>
	Fri, 21 Dec 2018 18:26:29 +0000 (12:26 -0600)
committer	Sage Weil <sage@redhat.com>
	Thu, 3 Jan 2019 17:17:31 +0000 (11:17 -0600)
qa/standalone/mon/mon-seesaw.sh	[new file with mode: 0755]	patch \| blob
qa/suites/rados/singleton/all/mon-seesaw.yaml	[deleted file]	patch \| blob \| history
qa/suites/rados/standalone/workloads/mon-seesaw.yaml	[new file with mode: 0644]	patch \| blob
qa/tasks/mon_seesaw.py	[deleted file]	patch \| blob \| history