From de4587012d2d0d35f9d201550d58ffc2faeff27b Mon Sep 17 00:00:00 2001 From: Jason Dillaman Date: Tue, 24 Apr 2018 09:00:18 -0400 Subject: [PATCH] qa/suites/rbd: rbd-mirror fsx stress test Signed-off-by: Jason Dillaman --- .../rbd/mirror-thrash/cluster/2-node.yaml | 6 ++ qa/suites/rbd/mirror-thrash/users/mirror.yaml | 3 + .../workloads/rbd-mirror-fsx-workunit.yaml | 32 +++++++ qa/suites/rbd/mirror/cluster | 1 + qa/suites/rbd/mirror/cluster/2-node.yaml | 31 ------- qa/tasks/rbd_fsx.py | 12 ++- qa/tasks/rbd_mirror_thrash.py | 25 +++--- qa/workunits/rbd/rbd_mirror.sh | 6 +- qa/workunits/rbd/rbd_mirror_fsx_compare.sh | 28 +++++++ qa/workunits/rbd/rbd_mirror_fsx_prepare.sh | 10 +++ qa/workunits/rbd/rbd_mirror_ha.sh | 6 +- qa/workunits/rbd/rbd_mirror_helpers.sh | 84 +++++++++++++------ qa/workunits/rbd/rbd_mirror_stress.sh | 6 +- 13 files changed, 173 insertions(+), 77 deletions(-) create mode 100644 qa/suites/rbd/mirror-thrash/workloads/rbd-mirror-fsx-workunit.yaml create mode 120000 qa/suites/rbd/mirror/cluster delete mode 100644 qa/suites/rbd/mirror/cluster/2-node.yaml create mode 100755 qa/workunits/rbd/rbd_mirror_fsx_compare.sh create mode 100755 qa/workunits/rbd/rbd_mirror_fsx_prepare.sh diff --git a/qa/suites/rbd/mirror-thrash/cluster/2-node.yaml b/qa/suites/rbd/mirror-thrash/cluster/2-node.yaml index 700c029dfafc2..74f9fb3c466f3 100644 --- a/qa/suites/rbd/mirror-thrash/cluster/2-node.yaml +++ b/qa/suites/rbd/mirror-thrash/cluster/2-node.yaml @@ -18,8 +18,14 @@ roles: - cluster1.client.mirror.1 - cluster1.client.mirror.2 - cluster1.client.mirror.3 + - cluster1.client.mirror.4 + - cluster1.client.mirror.5 + - cluster1.client.mirror.6 - cluster2.client.mirror - cluster2.client.mirror.0 - cluster2.client.mirror.1 - cluster2.client.mirror.2 - cluster2.client.mirror.3 + - cluster2.client.mirror.4 + - cluster2.client.mirror.5 + - cluster2.client.mirror.6 diff --git a/qa/suites/rbd/mirror-thrash/users/mirror.yaml b/qa/suites/rbd/mirror-thrash/users/mirror.yaml index 
4915122479746..9466fa1f7d397 100644 --- a/qa/suites/rbd/mirror-thrash/users/mirror.yaml +++ b/qa/suites/rbd/mirror-thrash/users/mirror.yaml @@ -3,6 +3,9 @@ meta: overrides: ceph: conf: + client: + rbd default features: 125 + debug rbd_mirror: 15 # override to make these names predictable client.mirror.0: admin socket: /var/run/ceph/rbd-mirror.$cluster-$name.asok diff --git a/qa/suites/rbd/mirror-thrash/workloads/rbd-mirror-fsx-workunit.yaml b/qa/suites/rbd/mirror-thrash/workloads/rbd-mirror-fsx-workunit.yaml new file mode 100644 index 0000000000000..6b7dd40ceb57c --- /dev/null +++ b/qa/suites/rbd/mirror-thrash/workloads/rbd-mirror-fsx-workunit.yaml @@ -0,0 +1,32 @@ +meta: +- desc: run multiple FSX workloads to simulate cluster load and then verify + that the images were replicated +tasks: +- workunit: + clients: + cluster1.client.mirror: [rbd/rbd_mirror_fsx_prepare.sh] + env: + # override workunit setting of CEPH_ARGS='--cluster' + CEPH_ARGS: '' + RBD_MIRROR_NOCLEANUP: '1' + RBD_MIRROR_USE_EXISTING_CLUSTER: '1' + RBD_MIRROR_USE_RBD_MIRROR: '1' +- rbd_fsx: + clients: + - cluster1.client.mirror.0 + - cluster1.client.mirror.1 + - cluster1.client.mirror.2 + - cluster1.client.mirror.3 + - cluster1.client.mirror.4 + - cluster1.client.mirror.5 + ops: 20000 + keep_images: true + pool_name: mirror +- workunit: + clients: + cluster1.client.mirror: [rbd/rbd_mirror_fsx_compare.sh] + env: + # override workunit setting of CEPH_ARGS='--cluster' + CEPH_ARGS: '' + RBD_MIRROR_USE_EXISTING_CLUSTER: '1' + RBD_MIRROR_USE_RBD_MIRROR: '1' diff --git a/qa/suites/rbd/mirror/cluster b/qa/suites/rbd/mirror/cluster new file mode 120000 index 0000000000000..3fc87a150ecb0 --- /dev/null +++ b/qa/suites/rbd/mirror/cluster @@ -0,0 +1 @@ +../mirror-thrash/cluster \ No newline at end of file diff --git a/qa/suites/rbd/mirror/cluster/2-node.yaml b/qa/suites/rbd/mirror/cluster/2-node.yaml deleted file mode 100644 index 74f9fb3c466f3..0000000000000 --- a/qa/suites/rbd/mirror/cluster/2-node.yaml +++ 
/dev/null @@ -1,31 +0,0 @@ -meta: -- desc: 2 ceph clusters with 1 mon and 3 osds each -roles: -- - cluster1.mon.a - - cluster1.mgr.x - - cluster2.mgr.x - - cluster1.osd.0 - - cluster1.osd.1 - - cluster1.osd.2 - - cluster1.client.0 - - cluster2.client.0 -- - cluster2.mon.a - - cluster2.osd.0 - - cluster2.osd.1 - - cluster2.osd.2 - - cluster1.client.mirror - - cluster1.client.mirror.0 - - cluster1.client.mirror.1 - - cluster1.client.mirror.2 - - cluster1.client.mirror.3 - - cluster1.client.mirror.4 - - cluster1.client.mirror.5 - - cluster1.client.mirror.6 - - cluster2.client.mirror - - cluster2.client.mirror.0 - - cluster2.client.mirror.1 - - cluster2.client.mirror.2 - - cluster2.client.mirror.3 - - cluster2.client.mirror.4 - - cluster2.client.mirror.5 - - cluster2.client.mirror.6 diff --git a/qa/tasks/rbd_fsx.py b/qa/tasks/rbd_fsx.py index 7dae4478a4655..d32475ecd03c9 100644 --- a/qa/tasks/rbd_fsx.py +++ b/qa/tasks/rbd_fsx.py @@ -4,6 +4,7 @@ Run fsx on an rbd image import contextlib import logging +from teuthology.exceptions import ConfigError from teuthology.parallel import parallel from teuthology import misc as teuthology @@ -68,8 +69,15 @@ def _run_one_client(ctx, config, role): config.get('valgrind') ) + cluster_name, type_, client_id = teuthology.split_role(role) + if type_ != 'client': + msg = 'client role ({0}) must be a client'.format(role) + raise ConfigError(msg) + args.extend([ 'ceph_test_librbd_fsx', + '--cluster', cluster_name, + '--id', client_id, '-d', # debug output for all operations '-W', '-R', # mmap doesn't work with rbd '-p', str(config.get('progress_interval', 100)), # show progress @@ -96,8 +104,10 @@ def _run_one_client(ctx, config, role): args.append('-g') # -g deep copy instead of clone if config.get('journal_replay', False): args.append('-j') # -j replay all IO events from journal + if config.get('keep_images', False): + args.append('-k') # -k keep images on success args.extend([ - 'pool_{pool}'.format(pool=role), + config.get('pool_name', 
'pool_{pool}'.format(pool=role)), 'image_{image}'.format(image=role), ]) diff --git a/qa/tasks/rbd_mirror_thrash.py b/qa/tasks/rbd_mirror_thrash.py index 5c92f50714a8d..6e35c3140147d 100644 --- a/qa/tasks/rbd_mirror_thrash.py +++ b/qa/tasks/rbd_mirror_thrash.py @@ -34,7 +34,10 @@ class RBDMirrorThrasher(Greenlet): max_thrash: [default: 1] the maximum number of active rbd-mirror daemons per cluster will be thrashed at any given time. - max_thrash_delay: [default: 30] maximum number of seconds to delay before + min_thrash_delay: [default: 60] minimum number of seconds to delay before + thrashing again. + + max_thrash_delay: [default: 120] maximum number of seconds to delay before thrashing again. max_revive_delay: [default: 10] maximum number of seconds to delay before @@ -71,7 +74,8 @@ class RBDMirrorThrasher(Greenlet): self.randomize = bool(self.config.get('randomize', True)) self.max_thrash = int(self.config.get('max_thrash', 1)) - self.max_thrash_delay = float(self.config.get('thrash_delay', 60.0)) + self.min_thrash_delay = float(self.config.get('min_thrash_delay', 60.0)) + self.max_thrash_delay = float(self.config.get('max_thrash_delay', 120.0)) self.max_revive_delay = float(self.config.get('max_revive_delay', 10.0)) def _run(self): @@ -101,7 +105,7 @@ class RBDMirrorThrasher(Greenlet): while not self.stopping.is_set(): delay = self.max_thrash_delay if self.randomize: - delay = random.randrange(0.0, self.max_thrash_delay) + delay = random.uniform(self.min_thrash_delay, self.max_thrash_delay) if delay > 0.0: self.log('waiting for {delay} secs before thrashing'.format(delay=delay)) @@ -114,14 +118,10 @@ class RBDMirrorThrasher(Greenlet): weight = 1.0 / len(self.daemons) count = 0 for daemon in self.daemons: - # if we've reached max_thrash, we're done - count = count + 1 - if count > self.max_thrash: - break - - skip = random.randrange(0.0, 1.0) + skip = random.uniform(0.0, 1.0) if weight <= skip: - self.log('skipping thrash iteration with skip ({skip}) > weight 
({weight})'.format(skip=skip, weight=weight)) + self.log('skipping daemon {label} with skip ({skip}) > weight ({weight})'.format( + label=daemon.id_, skip=skip, weight=weight)) continue self.log('kill {label}'.format(label=daemon.id_)) @@ -129,6 +129,11 @@ class RBDMirrorThrasher(Greenlet): killed_daemons.append(daemon) stats['kill'] += 1 + # if we've reached max_thrash, we're done + count += 1 + if count >= self.max_thrash: + break + if killed_daemons: # wait for a while before restarting diff --git a/qa/workunits/rbd/rbd_mirror.sh b/qa/workunits/rbd/rbd_mirror.sh index ee8044ef239a2..655453f64cff4 100755 --- a/qa/workunits/rbd/rbd_mirror.sh +++ b/qa/workunits/rbd/rbd_mirror.sh @@ -1,4 +1,4 @@ -#!/bin/sh +#!/bin/sh -ex # # rbd_mirror.sh - test rbd-mirror daemon # @@ -9,6 +9,8 @@ . $(dirname $0)/rbd_mirror_helpers.sh +setup + testlog "TEST: add image and test replay" start_mirrors ${CLUSTER1} image=test @@ -436,5 +438,3 @@ if [ -z "${RBD_MIRROR_USE_RBD_MIRROR}" ]; then CEPH_ARGS='--id admin' ceph --cluster ${CLUSTER1} osd blacklist ls 2>&1 | grep -q "listed 0 entries" CEPH_ARGS='--id admin' ceph --cluster ${CLUSTER2} osd blacklist ls 2>&1 | grep -q "listed 0 entries" fi - -echo OK diff --git a/qa/workunits/rbd/rbd_mirror_fsx_compare.sh b/qa/workunits/rbd/rbd_mirror_fsx_compare.sh new file mode 100755 index 0000000000000..71d9a440097fb --- /dev/null +++ b/qa/workunits/rbd/rbd_mirror_fsx_compare.sh @@ -0,0 +1,28 @@ +#!/bin/sh -ex +# +# rbd_mirror_fsx_compare.sh - test rbd-mirror daemon under FSX workload +# +# The script is used to compare FSX-generated images between two clusters. +# + +. 
$(dirname $0)/rbd_mirror_helpers.sh + +trap 'cleanup $?' INT TERM EXIT + +setup_tempdir + +testlog "TEST: snapshot all pool images" +snap_id=`uuidgen` +for image in $(rbd --cluster ${CLUSTER1} --pool ${POOL} ls); do + create_snapshot ${CLUSTER1} ${POOL} ${image} ${snap_id} +done + +testlog "TEST: wait for snapshots" +for image in $(rbd --cluster ${CLUSTER1} --pool ${POOL} ls); do + wait_for_snap_present ${CLUSTER2} ${POOL} ${image} ${snap_id} +done + +testlog "TEST: compare image snapshots" +for image in $(rbd --cluster ${CLUSTER1} --pool ${POOL} ls); do + compare_image_snapshots ${POOL} ${image} +done diff --git a/qa/workunits/rbd/rbd_mirror_fsx_prepare.sh b/qa/workunits/rbd/rbd_mirror_fsx_prepare.sh new file mode 100755 index 0000000000000..d988987ba42a8 --- /dev/null +++ b/qa/workunits/rbd/rbd_mirror_fsx_prepare.sh @@ -0,0 +1,10 @@ +#!/bin/sh -ex +# +# rbd_mirror_fsx_prepare.sh - test rbd-mirror daemon under FSX workload +# +# The script is used to prepare the two clusters for the FSX workload. +# + +. $(dirname $0)/rbd_mirror_helpers.sh + +setup diff --git a/qa/workunits/rbd/rbd_mirror_ha.sh b/qa/workunits/rbd/rbd_mirror_ha.sh index fc08c1dbe6017..9ee0cb98f72fa 100755 --- a/qa/workunits/rbd/rbd_mirror_ha.sh +++ b/qa/workunits/rbd/rbd_mirror_ha.sh @@ -1,4 +1,4 @@ -#!/bin/sh +#!/bin/sh -ex # # rbd_mirror_ha.sh - test rbd-mirror daemons in HA mode # @@ -7,6 +7,8 @@ RBD_MIRROR_INSTANCES=${RBD_MIRROR_INSTANCES:-7} . 
$(dirname $0)/rbd_mirror_helpers.sh +setup + is_leader() { local instance=$1 @@ -205,5 +207,3 @@ for i in 0 1 2 3 4 5; do done stop_mirror ${CLUSTER1}:${LEADER} - -echo OK diff --git a/qa/workunits/rbd/rbd_mirror_helpers.sh b/qa/workunits/rbd/rbd_mirror_helpers.sh index a81d8989c05e3..5a58bae111891 100755 --- a/qa/workunits/rbd/rbd_mirror_helpers.sh +++ b/qa/workunits/rbd/rbd_mirror_helpers.sh @@ -1,4 +1,4 @@ -#!/bin/sh -x +#!/bin/sh # # rbd_mirror_helpers.sh - shared rbd-mirror daemon helper functions # @@ -245,11 +245,8 @@ setup_pools() rbd --cluster ${cluster} mirror pool peer add ${PARENT_POOL} ${remote_cluster} } -setup() +setup_tempdir() { - local c - trap cleanup INT TERM EXIT - if [ -n "${RBD_MIRROR_TEMDIR}" ]; then test -d "${RBD_MIRROR_TEMDIR}" || mkdir "${RBD_MIRROR_TEMDIR}" @@ -258,7 +255,14 @@ setup() else TEMPDIR=`mktemp -d` fi +} + +setup() +{ + local c + trap 'cleanup $?' INT TERM EXIT + setup_tempdir if [ -z "${RBD_MIRROR_USE_EXISTING_CLUSTER}" ]; then setup_cluster "${CLUSTER1}" setup_cluster "${CLUSTER2}" @@ -273,27 +277,41 @@ setup() cleanup() { - test -n "${RBD_MIRROR_NOCLEANUP}" && return - local cluster instance + local error_code=$1 set +e - for cluster in "${CLUSTER1}" "${CLUSTER2}"; do - stop_mirrors "${cluster}" - done + if [ "${error_code}" -ne 0 ]; then + status + fi - if [ -z "${RBD_MIRROR_USE_EXISTING_CLUSTER}" ]; then - cd ${CEPH_ROOT} - CEPH_ARGS='' ${CEPH_SRC}/mstop.sh ${CLUSTER1} - CEPH_ARGS='' ${CEPH_SRC}/mstop.sh ${CLUSTER2} + if [ -z "${RBD_MIRROR_NOCLEANUP}" ]; then + local cluster instance + + for cluster in "${CLUSTER1}" "${CLUSTER2}"; do + stop_mirrors "${cluster}" + done + + if [ -z "${RBD_MIRROR_USE_EXISTING_CLUSTER}" ]; then + cd ${CEPH_ROOT} + CEPH_ARGS='' ${CEPH_SRC}/mstop.sh ${CLUSTER1} + CEPH_ARGS='' ${CEPH_SRC}/mstop.sh ${CLUSTER2} + else + CEPH_ARGS='' ceph --cluster ${CLUSTER1} osd pool rm ${POOL} ${POOL} --yes-i-really-really-mean-it + CEPH_ARGS='' ceph --cluster ${CLUSTER2} osd pool rm ${POOL} ${POOL} 
--yes-i-really-really-mean-it + CEPH_ARGS='' ceph --cluster ${CLUSTER1} osd pool rm ${PARENT_POOL} ${PARENT_POOL} --yes-i-really-really-mean-it + CEPH_ARGS='' ceph --cluster ${CLUSTER2} osd pool rm ${PARENT_POOL} ${PARENT_POOL} --yes-i-really-really-mean-it + fi + test "${RBD_MIRROR_TEMDIR}" = "${TEMPDIR}" || rm -Rf ${TEMPDIR} + fi + + if [ "${error_code}" -eq 0 ]; then + echo "OK" else - CEPH_ARGS='' ceph --cluster ${CLUSTER1} osd pool rm ${POOL} ${POOL} --yes-i-really-really-mean-it - CEPH_ARGS='' ceph --cluster ${CLUSTER2} osd pool rm ${POOL} ${POOL} --yes-i-really-really-mean-it - CEPH_ARGS='' ceph --cluster ${CLUSTER1} osd pool rm ${PARENT_POOL} ${PARENT_POOL} --yes-i-really-really-mean-it - CEPH_ARGS='' ceph --cluster ${CLUSTER2} osd pool rm ${PARENT_POOL} ${PARENT_POOL} --yes-i-really-really-mean-it + echo "FAIL" fi - test "${RBD_MIRROR_TEMDIR}" = "${TEMPDIR}" || - rm -Rf ${TEMPDIR} + + exit ${error_code} } start_mirror() @@ -507,10 +525,11 @@ test_image_replay_state() local pool=$2 local image=$3 local test_state=$4 + local status_result local current_state=stopped - admin_daemons "${cluster}" rbd mirror status ${pool}/${image} | - grep -i 'state.*Replaying' && current_state=started + status_result=$(admin_daemons "${cluster}" rbd mirror status ${pool}/${image} | grep -i 'state') || return 1 + echo "${status_result}" | grep -i 'Replaying' && current_state=started test "${test_state}" = "${current_state}" } @@ -858,6 +877,23 @@ compare_images() rm -f ${rmt_export} ${loc_export} } +compare_image_snapshots() +{ + local pool=$1 + local image=$2 + + local rmt_export=${TEMPDIR}/${CLUSTER2}-${pool}-${image}.export + local loc_export=${TEMPDIR}/${CLUSTER1}-${pool}-${image}.export + + for snap_name in $(rbd --cluster ${CLUSTER1} -p ${pool} snap list ${image} | awk 'NR > 1 { print $2 }'); do + rm -f ${rmt_export} ${loc_export} + rbd --cluster ${CLUSTER2} -p ${pool} export ${image}@${snap_name} ${rmt_export} + rbd --cluster ${CLUSTER1} -p ${pool} export ${image}@${snap_name} ${loc_export} + cmp 
${rmt_export} ${loc_export} + done + rm -f ${rmt_export} ${loc_export} +} + demote_image() { local cluster=$1 @@ -994,7 +1030,3 @@ then $@ exit $? fi - -set -xe - -setup diff --git a/qa/workunits/rbd/rbd_mirror_stress.sh b/qa/workunits/rbd/rbd_mirror_stress.sh index f21984eeee448..a1d7d03470a61 100755 --- a/qa/workunits/rbd/rbd_mirror_stress.sh +++ b/qa/workunits/rbd/rbd_mirror_stress.sh @@ -1,4 +1,4 @@ -#!/bin/sh +#!/bin/sh -ex # # rbd_mirror_stress.sh - stress test rbd-mirror daemon # @@ -13,6 +13,8 @@ export LOCKDEP=0 . $(dirname $0)/rbd_mirror_helpers.sh +setup + create_snap() { local cluster=$1 @@ -182,5 +184,3 @@ do purge_snapshots ${CLUSTER2} ${POOL} ${image} remove_image_retry ${CLUSTER2} ${POOL} ${image} done - -echo OK -- 2.39.5