qa/suites/rbd: rbd-mirror fsx stress test

author Jason Dillaman <dillaman@redhat.com>

Tue, 24 Apr 2018 13:00:18 +0000 (09:00 -0400)

committer Jason Dillaman <dillaman@redhat.com>

Fri, 27 Apr 2018 12:34:42 +0000 (08:34 -0400)
author Jason Dillaman <dillaman@redhat.com>
Tue, 24 Apr 2018 13:00:18 +0000 (09:00 -0400)
committer Jason Dillaman <dillaman@redhat.com>
Fri, 27 Apr 2018 12:34:42 +0000 (08:34 -0400)
diff --git a/qa/suites/rbd/mirror-thrash/cluster/2-node.yaml b/qa/suites/rbd/mirror-thrash/cluster/2-node.yaml

index 700c029dfafc221ba325092cbe910d36771993b6..74f9fb3c466f36014ed408c6fcc939fe4e9e9e17 100644 (file)
--- a/qa/suites/rbd/mirror-thrash/cluster/2-node.yaml
+++ b/qa/suites/rbd/mirror-thrash/cluster/2-node.yaml
@@ -18,8 +18,14 @@ roles:
    - cluster1.client.mirror.1
    - cluster1.client.mirror.2
    - cluster1.client.mirror.3
+  - cluster1.client.mirror.4
+  - cluster1.client.mirror.5
+  - cluster1.client.mirror.6
    - cluster2.client.mirror
    - cluster2.client.mirror.0
    - cluster2.client.mirror.1
    - cluster2.client.mirror.2
    - cluster2.client.mirror.3
+  - cluster2.client.mirror.4
+  - cluster2.client.mirror.5
+  - cluster2.client.mirror.6
diff --git a/qa/suites/rbd/mirror-thrash/users/mirror.yaml b/qa/suites/rbd/mirror-thrash/users/mirror.yaml

index 491512247974659d8ab516ee607d08cef6280976..9466fa1f7d397ef908f84edc4d9283b55418f2c8 100644 (file)
--- a/qa/suites/rbd/mirror-thrash/users/mirror.yaml
+++ b/qa/suites/rbd/mirror-thrash/users/mirror.yaml
@@ -3,6 +3,9 @@ meta:
  overrides:
    ceph:
      conf:
+      client:
+        rbd default features: 125
+        debug rbd_mirror: 15
        # override to make these names predictable
        client.mirror.0:
          admin socket: /var/run/ceph/rbd-mirror.$cluster-$name.asok
diff --git a/qa/suites/rbd/mirror-thrash/workloads/rbd-mirror-fsx-workunit.yaml b/qa/suites/rbd/mirror-thrash/workloads/rbd-mirror-fsx-workunit.yaml

new file mode 100644 (file)

index 0000000..6b7dd40
--- /dev/null
+++ b/qa/suites/rbd/mirror-thrash/workloads/rbd-mirror-fsx-workunit.yaml
@@ -0,0 +1,32 @@
+meta:
+- desc: run multiple FSX workloads to simulate cluster load and then verify
+        that the images were replicated
+tasks:
+- workunit:
+    clients:
+      cluster1.client.mirror: [rbd/rbd_mirror_fsx_prepare.sh]
+    env:
+      # override workunit setting of CEPH_ARGS='--cluster'
+      CEPH_ARGS: ''
+      RBD_MIRROR_NOCLEANUP: '1'
+      RBD_MIRROR_USE_EXISTING_CLUSTER: '1'
+      RBD_MIRROR_USE_RBD_MIRROR: '1'
+- rbd_fsx:
+    clients:
+      - cluster1.client.mirror.0
+      - cluster1.client.mirror.1
+      - cluster1.client.mirror.2
+      - cluster1.client.mirror.3
+      - cluster1.client.mirror.4
+      - cluster1.client.mirror.5
+    ops: 20000
+    keep_images: true
+    pool_name: mirror
+- workunit:
+    clients:
+      cluster1.client.mirror: [rbd/rbd_mirror_fsx_compare.sh]
+    env:
+      # override workunit setting of CEPH_ARGS='--cluster'
+      CEPH_ARGS: ''
+      RBD_MIRROR_USE_EXISTING_CLUSTER: '1'
+      RBD_MIRROR_USE_RBD_MIRROR: '1'
diff --git a/qa/suites/rbd/mirror/cluster b/qa/suites/rbd/mirror/cluster

new file mode 120000 (symlink)

index 0000000..3fc87a1
--- /dev/null
+++ b/qa/suites/rbd/mirror/cluster
@@ -0,0 +1 @@
+../mirror-thrash/cluster
+\ No newline at end of file
diff --git a/qa/suites/rbd/mirror/cluster/2-node.yaml b/qa/suites/rbd/mirror/cluster/2-node.yaml

deleted file mode 100644 (file)

index 74f9fb3..0000000
--- a/qa/suites/rbd/mirror/cluster/2-node.yaml
+++ /dev/null
@@ -1,31 +0,0 @@
-meta:
-- desc: 2 ceph clusters with 1 mon and 3 osds each
-roles:
-- - cluster1.mon.a
-  - cluster1.mgr.x
-  - cluster2.mgr.x
-  - cluster1.osd.0
-  - cluster1.osd.1
-  - cluster1.osd.2
-  - cluster1.client.0
-  - cluster2.client.0
-- - cluster2.mon.a
-  - cluster2.osd.0
-  - cluster2.osd.1
-  - cluster2.osd.2
-  - cluster1.client.mirror
-  - cluster1.client.mirror.0
-  - cluster1.client.mirror.1
-  - cluster1.client.mirror.2
-  - cluster1.client.mirror.3
-  - cluster1.client.mirror.4
-  - cluster1.client.mirror.5
-  - cluster1.client.mirror.6
-  - cluster2.client.mirror
-  - cluster2.client.mirror.0
-  - cluster2.client.mirror.1
-  - cluster2.client.mirror.2
-  - cluster2.client.mirror.3
-  - cluster2.client.mirror.4
-  - cluster2.client.mirror.5
-  - cluster2.client.mirror.6
diff --git a/qa/tasks/rbd_fsx.py b/qa/tasks/rbd_fsx.py

index 7dae4478a465594b33837100ee8920ce60bf8199..d32475ecd03c9c710476b20d54f72aa9c388212f 100644 (file)
--- a/qa/tasks/rbd_fsx.py
+++ b/qa/tasks/rbd_fsx.py
@@ -4,6 +4,8 @@ Run fsx on an rbd image
  import contextlib
  import logging
  
+from teuthology.exceptions import ConfigError
+from teuthology.orchestra import run
  from teuthology.parallel import parallel
  from teuthology import misc as teuthology
  
@@ -68,8 +70,15 @@ def _run_one_client(ctx, config, role):
              config.get('valgrind')
          )
  
+    cluster_name, type_, client_id = teuthology.split_role(role)
+    if type_ != 'client':
+        msg = 'client role ({0}) must be a client'.format(role)
+        raise ConfigError(msg)
+
      args.extend([
          'ceph_test_librbd_fsx',
+        '--cluster', cluster_name,
+        '--id', client_id,
          '-d', # debug output for all operations
          '-W', '-R', # mmap doesn't work with rbd
          '-p', str(config.get('progress_interval', 100)), # show progress
@@ -96,8 +105,10 @@ def _run_one_client(ctx, config, role):
          args.append('-g') # -g deep copy instead of clone
      if config.get('journal_replay', False):
          args.append('-j') # -j replay all IO events from journal
+    if config.get('keep_images', False):
+        args.append('-k') # -k keep images on success
      args.extend([
-        'pool_{pool}'.format(pool=role),
+        config.get('pool_name', 'pool_{pool}'.format(pool=role)),
          'image_{image}'.format(image=role),
      ])
  
diff --git a/qa/tasks/rbd_mirror_thrash.py b/qa/tasks/rbd_mirror_thrash.py

index 5c92f50714a8d91929bc321c8abfa297ee2ee387..6e35c3140147d5e1ad1057ba515176b188e831c2 100644 (file)
--- a/qa/tasks/rbd_mirror_thrash.py
+++ b/qa/tasks/rbd_mirror_thrash.py
@@ -34,7 +34,10 @@ class RBDMirrorThrasher(Greenlet):
      max_thrash: [default: 1] the maximum number of active rbd-mirror daemons per
        cluster will be thrashed at any given time.
  
-    max_thrash_delay: [default: 30] maximum number of seconds to delay before
+    min_thrash_delay: [default: 60] minimum number of seconds to delay before
+      thrashing again.
+
+    max_thrash_delay: [default: 120] maximum number of seconds to delay before
        thrashing again.
  
      max_revive_delay: [default: 10] maximum number of seconds to delay before
@@ -71,7 +74,8 @@ class RBDMirrorThrasher(Greenlet):
  
          self.randomize = bool(self.config.get('randomize', True))
          self.max_thrash = int(self.config.get('max_thrash', 1))
-        self.max_thrash_delay = float(self.config.get('thrash_delay', 60.0))
+        self.min_thrash_delay = float(self.config.get('min_thrash_delay', 60.0))
+        self.max_thrash_delay = float(self.config.get('max_thrash_delay', 120.0))
          self.max_revive_delay = float(self.config.get('max_revive_delay', 10.0))
  
      def _run(self):
@@ -101,7 +105,7 @@ class RBDMirrorThrasher(Greenlet):
          while not self.stopping.is_set():
              delay = self.max_thrash_delay
              if self.randomize:
-                delay = random.randrange(0.0, self.max_thrash_delay)
+                delay = random.uniform(self.min_thrash_delay, self.max_thrash_delay)
  
              if delay > 0.0:
                  self.log('waiting for {delay} secs before thrashing'.format(delay=delay))
@@ -114,14 +118,10 @@ class RBDMirrorThrasher(Greenlet):
              weight = 1.0 / len(self.daemons)
              count = 0
              for daemon in self.daemons:
-                # if we've reached max_thrash, we're done
-                count = count + 1
-                if count > self.max_thrash:
-                    break
-
-                skip = random.randrange(0.0, 1.0)
+                skip = random.uniform(0.0, 1.0)
                  if weight <= skip:
-                    self.log('skipping thrash iteration with skip ({skip}) > weight ({weight})'.format(skip=skip, weight=weight))
+                    self.log('skipping daemon {label} with skip ({skip}) > weight ({weight})'.format(
+                        label=daemon.id_, skip=skip, weight=weight))
                      continue
  
                  self.log('kill {label}'.format(label=daemon.id_))
@@ -129,6 +129,11 @@ class RBDMirrorThrasher(Greenlet):
                  killed_daemons.append(daemon)
                  stats['kill'] += 1
  
+                # if we've reached max_thrash, we're done
+                count += 1
+                if count >= self.max_thrash:
+                    break
+
              if killed_daemons:
                  # wait for a while before restarting
  
diff --git a/qa/workunits/rbd/rbd_mirror.sh b/qa/workunits/rbd/rbd_mirror.sh

index ee8044ef239a2709948c66224e3cec87f689f7cb..655453f64cff49973c50b4f9b2e7c002c1ea80bc 100755 (executable)
--- a/qa/workunits/rbd/rbd_mirror.sh
+++ b/qa/workunits/rbd/rbd_mirror.sh
@@ -1,4 +1,4 @@
-#!/bin/sh
+#!/bin/sh -ex
  #
  # rbd_mirror.sh - test rbd-mirror daemon
  #
@@ -9,6 +9,8 @@
  
  . $(dirname $0)/rbd_mirror_helpers.sh
  
+setup
+
  testlog "TEST: add image and test replay"
  start_mirrors ${CLUSTER1}
  image=test
@@ -436,5 +438,3 @@ if [ -z "${RBD_MIRROR_USE_RBD_MIRROR}" ]; then
    CEPH_ARGS='--id admin' ceph --cluster ${CLUSTER1} osd blacklist ls 2>&1 | grep -q "listed 0 entries"
    CEPH_ARGS='--id admin' ceph --cluster ${CLUSTER2} osd blacklist ls 2>&1 | grep -q "listed 0 entries"
  fi
-
-echo OK
diff --git a/qa/workunits/rbd/rbd_mirror_fsx_compare.sh b/qa/workunits/rbd/rbd_mirror_fsx_compare.sh

new file mode 100755 (executable)

index 0000000..71d9a44
--- /dev/null
+++ b/qa/workunits/rbd/rbd_mirror_fsx_compare.sh
@@ -0,0 +1,28 @@
+#!/bin/sh -ex
+#
+# rbd_mirror_fsx_compare.sh - test rbd-mirror daemon under FSX workload
+#
+# The script is used to compare FSX-generated images between two clusters.
+#
+
+. $(dirname $0)/rbd_mirror_helpers.sh
+# cleanup() expects the script's exit status as its first argument
+trap 'cleanup $?' INT TERM EXIT
+
+setup_tempdir
+
+testlog "TEST: snapshot all pool images"
+snap_id=`uuidgen`
+for image in $(rbd --cluster ${CLUSTER1} --pool ${POOL} ls); do
+    create_snapshot ${CLUSTER1} ${POOL} ${image} ${snap_id}
+done
+
+testlog "TEST: wait for snapshots"
+for image in $(rbd --cluster ${CLUSTER1} --pool ${POOL} ls); do
+    wait_for_snap_present ${CLUSTER2} ${POOL} ${image} ${snap_id}
+done
+
+testlog "TEST: compare image snapshots"
+for image in $(rbd --cluster ${CLUSTER1} --pool ${POOL} ls); do
+    compare_image_snapshots ${POOL} ${image}
+done
diff --git a/qa/workunits/rbd/rbd_mirror_fsx_prepare.sh b/qa/workunits/rbd/rbd_mirror_fsx_prepare.sh

new file mode 100755 (executable)

index 0000000..d988987
--- /dev/null
+++ b/qa/workunits/rbd/rbd_mirror_fsx_prepare.sh
@@ -0,0 +1,10 @@
+#!/bin/sh -ex
+#
+# rbd_mirror_fsx_prepare.sh - test rbd-mirror daemon under FSX workload
+#
+# The script is used to set up the test environment before the FSX workloads run.
+#
+
+. $(dirname $0)/rbd_mirror_helpers.sh
+
+setup
diff --git a/qa/workunits/rbd/rbd_mirror_ha.sh b/qa/workunits/rbd/rbd_mirror_ha.sh

index fc08c1dbe601766111b300eab250303c59266ab8..9ee0cb98f72fa073e52c971361ce6db5f60d44a0 100755 (executable)
--- a/qa/workunits/rbd/rbd_mirror_ha.sh
+++ b/qa/workunits/rbd/rbd_mirror_ha.sh
@@ -1,4 +1,4 @@
-#!/bin/sh
+#!/bin/sh -ex
  #
  # rbd_mirror_ha.sh - test rbd-mirror daemons in HA mode
  #
@@ -7,6 +7,8 @@ RBD_MIRROR_INSTANCES=${RBD_MIRROR_INSTANCES:-7}
  
  . $(dirname $0)/rbd_mirror_helpers.sh
  
+setup
+
  is_leader()
  {
      local instance=$1
@@ -205,5 +207,3 @@ for i in 0 1 2 3 4 5; do
  done
  
  stop_mirror ${CLUSTER1}:${LEADER}
-
-echo OK
diff --git a/qa/workunits/rbd/rbd_mirror_helpers.sh b/qa/workunits/rbd/rbd_mirror_helpers.sh

index a81d8989c05e35f65037c20586933b0fbdee7b72..5a58bae11189132ab511d8b380a44c82515e64c2 100755 (executable)
--- a/qa/workunits/rbd/rbd_mirror_helpers.sh
+++ b/qa/workunits/rbd/rbd_mirror_helpers.sh
@@ -1,4 +1,4 @@
-#!/bin/sh -x
+#!/bin/sh
  #
  # rbd_mirror_helpers.sh - shared rbd-mirror daemon helper functions
  #
@@ -245,11 +245,8 @@ setup_pools()
      rbd --cluster ${cluster} mirror pool peer add ${PARENT_POOL} ${remote_cluster}
  }
  
-setup()
+setup_tempdir()
  {
-    local c
-    trap cleanup INT TERM EXIT
-
      if [ -n "${RBD_MIRROR_TEMDIR}" ]; then
         test -d "${RBD_MIRROR_TEMDIR}" ||
         mkdir "${RBD_MIRROR_TEMDIR}"
@@ -258,7 +255,14 @@ setup()
      else
         TEMPDIR=`mktemp -d`
      fi
+}
+
+setup()
+{
+    local c
+    trap 'cleanup $?' INT TERM EXIT
  
+    setup_tempdir
      if [ -z "${RBD_MIRROR_USE_EXISTING_CLUSTER}" ]; then
         setup_cluster "${CLUSTER1}"
         setup_cluster "${CLUSTER2}"
@@ -273,27 +277,41 @@ setup()
  
  cleanup()
  {
-    test  -n "${RBD_MIRROR_NOCLEANUP}" && return
-    local cluster instance
+    local error_code=$1
  
      set +e
  
-    for cluster in "${CLUSTER1}" "${CLUSTER2}"; do
-       stop_mirrors "${cluster}"
-    done
+    if [ "${error_code}" -ne 0 ]; then
+        status
+    fi
  
-    if [ -z "${RBD_MIRROR_USE_EXISTING_CLUSTER}" ]; then
-        cd ${CEPH_ROOT}
-        CEPH_ARGS='' ${CEPH_SRC}/mstop.sh ${CLUSTER1}
-        CEPH_ARGS='' ${CEPH_SRC}/mstop.sh ${CLUSTER2}
+    if [ -z "${RBD_MIRROR_NOCLEANUP}" ]; then
+        local cluster instance
+
+        for cluster in "${CLUSTER1}" "${CLUSTER2}"; do
+           stop_mirrors "${cluster}"
+        done
+
+        if [ -z "${RBD_MIRROR_USE_EXISTING_CLUSTER}" ]; then
+            cd ${CEPH_ROOT}
+            CEPH_ARGS='' ${CEPH_SRC}/mstop.sh ${CLUSTER1}
+            CEPH_ARGS='' ${CEPH_SRC}/mstop.sh ${CLUSTER2}
+        else
+            CEPH_ARGS='' ceph --cluster ${CLUSTER1} osd pool rm ${POOL} ${POOL} --yes-i-really-really-mean-it
+            CEPH_ARGS='' ceph --cluster ${CLUSTER2} osd pool rm ${POOL} ${POOL} --yes-i-really-really-mean-it
+            CEPH_ARGS='' ceph --cluster ${CLUSTER1} osd pool rm ${PARENT_POOL} ${PARENT_POOL} --yes-i-really-really-mean-it
+            CEPH_ARGS='' ceph --cluster ${CLUSTER2} osd pool rm ${PARENT_POOL} ${PARENT_POOL} --yes-i-really-really-mean-it
+        fi
+        test "${RBD_MIRROR_TEMDIR}" = "${TEMPDIR}" || rm -Rf ${TEMPDIR}
+    fi
+
+    if [ "${error_code}" -eq 0 ]; then
+        echo "OK"
      else
-        CEPH_ARGS='' ceph --cluster ${CLUSTER1} osd pool rm ${POOL} ${POOL} --yes-i-really-really-mean-it
-        CEPH_ARGS='' ceph --cluster ${CLUSTER2} osd pool rm ${POOL} ${POOL} --yes-i-really-really-mean-it
-        CEPH_ARGS='' ceph --cluster ${CLUSTER1} osd pool rm ${PARENT_POOL} ${PARENT_POOL} --yes-i-really-really-mean-it
-        CEPH_ARGS='' ceph --cluster ${CLUSTER2} osd pool rm ${PARENT_POOL} ${PARENT_POOL} --yes-i-really-really-mean-it
+        echo "FAIL"
      fi
-    test "${RBD_MIRROR_TEMDIR}" = "${TEMPDIR}" ||
-    rm -Rf ${TEMPDIR}
+
+    exit ${error_code}
  }
  
  start_mirror()
@@ -507,10 +525,11 @@ test_image_replay_state()
      local pool=$2
      local image=$3
      local test_state=$4
+    local status_result
      local current_state=stopped
  
-    admin_daemons "${cluster}" rbd mirror status ${pool}/${image} |
-       grep -i 'state.*Replaying' && current_state=started
+    status_result=$(admin_daemons "${cluster}" rbd mirror status ${pool}/${image} | grep -i 'state') || return 1
+    echo "${status_result}" | grep -i 'Replaying' && current_state=started
      test "${test_state}" = "${current_state}"
  }
  
@@ -858,6 +877,23 @@ compare_images()
      rm -f ${rmt_export} ${loc_export}
  }
  
+compare_image_snapshots()
+{
+    # Export each snapshot of ${pool}/${image} from both clusters and cmp them.
+    local pool=$1 image=$2 snap_name
+    local rmt_export=${TEMPDIR}/${CLUSTER2}-${pool}-${image}.export
+    local loc_export=${TEMPDIR}/${CLUSTER1}-${pool}-${image}.export
+
+    # plain "snap list" prints a header row plus a table: keep only the NAME column
+    for snap_name in $(rbd --cluster ${CLUSTER1} -p ${pool} snap list ${image} | awk 'NR > 1 { print $2 }'); do
+        rm -f ${rmt_export} ${loc_export}
+        rbd --cluster ${CLUSTER2} -p ${pool} export ${image}@${snap_name} ${rmt_export}
+        rbd --cluster ${CLUSTER1} -p ${pool} export ${image}@${snap_name} ${loc_export}
+        cmp ${rmt_export} ${loc_export}
+    done
+    rm -f ${rmt_export} ${loc_export}
+}
+
  demote_image()
  {
      local cluster=$1
@@ -994,7 +1030,3 @@ then
      $@
      exit $?
  fi
-
-set -xe
-
-setup
diff --git a/qa/workunits/rbd/rbd_mirror_stress.sh b/qa/workunits/rbd/rbd_mirror_stress.sh

index f21984eeee448deed9f3cbb1732c344c11024b4b..a1d7d03470a61c858a22af319419a31ab3cfe6e1 100755 (executable)
--- a/qa/workunits/rbd/rbd_mirror_stress.sh
+++ b/qa/workunits/rbd/rbd_mirror_stress.sh
@@ -1,4 +1,4 @@
-#!/bin/sh
+#!/bin/sh -ex
  #
  # rbd_mirror_stress.sh - stress test rbd-mirror daemon
  #
@@ -13,6 +13,8 @@ export LOCKDEP=0
  
  . $(dirname $0)/rbd_mirror_helpers.sh
  
+setup
+
  create_snap()
  {
      local cluster=$1
@@ -182,5 +184,3 @@ do
    purge_snapshots ${CLUSTER2} ${POOL} ${image}
    remove_image_retry ${CLUSTER2} ${POOL} ${image}
  done
-
-echo OK
author	Jason Dillaman <dillaman@redhat.com>
	Tue, 24 Apr 2018 13:00:18 +0000 (09:00 -0400)
committer	Jason Dillaman <dillaman@redhat.com>
	Fri, 27 Apr 2018 12:34:42 +0000 (08:34 -0400)
qa/suites/rbd/mirror-thrash/cluster/2-node.yaml		patch \| blob \| history
qa/suites/rbd/mirror-thrash/users/mirror.yaml		patch \| blob \| history
qa/suites/rbd/mirror-thrash/workloads/rbd-mirror-fsx-workunit.yaml	[new file with mode: 0644]	patch \| blob
qa/suites/rbd/mirror/cluster	[new symlink]	patch \| blob
qa/suites/rbd/mirror/cluster/2-node.yaml	[deleted file]	patch \| blob \| history
qa/tasks/rbd_fsx.py		patch \| blob \| history
qa/tasks/rbd_mirror_thrash.py		patch \| blob \| history
qa/workunits/rbd/rbd_mirror.sh		patch \| blob \| history
qa/workunits/rbd/rbd_mirror_fsx_compare.sh	[new file with mode: 0755]	patch \| blob
qa/workunits/rbd/rbd_mirror_fsx_prepare.sh	[new file with mode: 0755]	patch \| blob
qa/workunits/rbd/rbd_mirror_ha.sh		patch \| blob \| history
qa/workunits/rbd/rbd_mirror_helpers.sh		patch \| blob \| history
qa/workunits/rbd/rbd_mirror_stress.sh		patch \| blob \| history