git.apps.os.sepia.ceph.com Git - ceph.git/commitdiff
qa: Add tests to validate syncing of images using rbd-mirror
author Ramana Raja <rraja@redhat.com>
Thu, 25 May 2023 16:48:12 +0000 (16:48 +0000)
committer Ramana Raja <rraja@redhat.com>
Thu, 14 Mar 2024 13:47:48 +0000 (09:47 -0400)
Introduce functional tests to validate that the images under
workloads are correctly mirrored between two clusters using snapshot
based mirroring.

Run workload on a primary image using a krbd or nbd client. Take
mirror snapshots of the image under workload. Unmount the mapped image
and calculate its MD5 checksum before demoting it. After demotion,
wait for the mirror status of the image to be 'up+unknown' in both
the clusters. This is to make sure that the non-primary image in the
other cluster is ready to be promoted. Now promote the non-primary
image in the other cluster. Map the promoted image and calculate its
MD5 checksum. Verify that the checksums of the demoted and promoted
images in the two clusters are the same.

The above test is run as part of two different workunits:
 - a workunit that validates the syncing of multiple mirrored images
   with workloads running on them
 - another workunit that validates the syncing of a single mirrored
   image with a workload running on it, where the image is set as
   primary alternately between the two clusters, as happens during
   failover and failback scenarios.

Fixes: https://tracker.ceph.com/issues/61617
Signed-off-by: Ramana Raja <rraja@redhat.com>
Co-authored-by: Ilya Dryomov <idryomov@redhat.com>
Co-authored-by: Christopher Hoffman <choffman@redhat.com>
(cherry picked from commit b7aae5c3c5a1dd24c4cb7ceb499292af00bae680)

Cherry-pick notes:
- In qa/workunits/rbd/compare_mirror_images.sh, replace
  `wait_for_replaying_status_in_pool_dir` with `wait_for_status_in_pool_dir`,
  because commit 3fd8a03, which added `wait_for_replaying_status_in_pool_dir`,
  was not backported.

qa/suites/rbd/mirror/workloads/compare-mirror-image-alternate-primary-krbd.yaml [new file with mode: 0644]
qa/suites/rbd/mirror/workloads/compare-mirror-image-alternate-primary-nbd.yaml [new file with mode: 0644]
qa/suites/rbd/mirror/workloads/compare-mirror-images-krbd.yaml [new file with mode: 0644]
qa/suites/rbd/mirror/workloads/compare-mirror-images-nbd.yaml [new file with mode: 0644]
qa/workunits/rbd/compare_mirror_image_alternate_primary.sh [new file with mode: 0755]
qa/workunits/rbd/compare_mirror_images.sh [new file with mode: 0755]

diff --git a/qa/suites/rbd/mirror/workloads/compare-mirror-image-alternate-primary-krbd.yaml b/qa/suites/rbd/mirror/workloads/compare-mirror-image-alternate-primary-krbd.yaml
new file mode 100644 (file)
index 0000000..771400d
--- /dev/null
@@ -0,0 +1,13 @@
+# Workload fragment: runs the alternate-primary image comparison workunit
+# with RBD_DEVICE_TYPE set to 'krbd'.
+overrides:
+  install:
+    ceph:
+      extra_system_packages:
+        # the workunit pipes its untar workload through `pv -L` to rate
+        # limit it
+        - pv
+tasks:
+- workunit:
+    clients:
+      cluster1.client.mirror:
+        - rbd/compare_mirror_image_alternate_primary.sh
+    env:
+      # read by the workunit to pick the `rbd device map -t <type>` backend
+      RBD_DEVICE_TYPE: 'krbd'
+    timeout: 3h
diff --git a/qa/suites/rbd/mirror/workloads/compare-mirror-image-alternate-primary-nbd.yaml b/qa/suites/rbd/mirror/workloads/compare-mirror-image-alternate-primary-nbd.yaml
new file mode 100644 (file)
index 0000000..e87d0e8
--- /dev/null
@@ -0,0 +1,15 @@
+# Workload fragment: runs the alternate-primary image comparison workunit
+# with RBD_DEVICE_TYPE set to 'nbd'.
+overrides:
+  install:
+    ceph:
+      extra_packages:
+        # presumably provides the backend for `rbd device map -t nbd`
+        - rbd-nbd
+      extra_system_packages:
+        # the workunit pipes its untar workload through `pv -L` to rate
+        # limit it
+        - pv
+tasks:
+- workunit:
+    clients:
+      cluster1.client.mirror:
+        - rbd/compare_mirror_image_alternate_primary.sh
+    env:
+      # read by the workunit to pick the `rbd device map -t <type>` backend
+      RBD_DEVICE_TYPE: 'nbd'
+    timeout: 3h
diff --git a/qa/suites/rbd/mirror/workloads/compare-mirror-images-krbd.yaml b/qa/suites/rbd/mirror/workloads/compare-mirror-images-krbd.yaml
new file mode 100644 (file)
index 0000000..fc16198
--- /dev/null
@@ -0,0 +1,13 @@
+# Workload fragment: runs the multi-image comparison workunit with
+# RBD_DEVICE_TYPE set to 'krbd'.
+overrides:
+  install:
+    ceph:
+      extra_system_packages:
+        # the workunit pipes its untar workload through `pv -L` to rate
+        # limit it
+        - pv
+tasks:
+- workunit:
+    clients:
+      cluster1.client.mirror:
+        - rbd/compare_mirror_images.sh
+    env:
+      # read by the workunit to pick the `rbd device map -t <type>` backend
+      RBD_DEVICE_TYPE: 'krbd'
+    timeout: 3h
diff --git a/qa/suites/rbd/mirror/workloads/compare-mirror-images-nbd.yaml b/qa/suites/rbd/mirror/workloads/compare-mirror-images-nbd.yaml
new file mode 100644 (file)
index 0000000..ed02ed2
--- /dev/null
@@ -0,0 +1,15 @@
+# Workload fragment: runs the multi-image comparison workunit with
+# RBD_DEVICE_TYPE set to 'nbd'.
+overrides:
+  install:
+    ceph:
+      extra_packages:
+        # presumably provides the backend for `rbd device map -t nbd`
+        - rbd-nbd
+      extra_system_packages:
+        # the workunit pipes its untar workload through `pv -L` to rate
+        # limit it
+        - pv
+tasks:
+- workunit:
+    clients:
+      cluster1.client.mirror:
+        - rbd/compare_mirror_images.sh
+    env:
+      # read by the workunit to pick the `rbd device map -t <type>` backend
+      RBD_DEVICE_TYPE: 'nbd'
+    timeout: 3h
diff --git a/qa/workunits/rbd/compare_mirror_image_alternate_primary.sh b/qa/workunits/rbd/compare_mirror_image_alternate_primary.sh
new file mode 100755 (executable)
index 0000000..338f43f
--- /dev/null
@@ -0,0 +1,106 @@
+#!/bin/bash
+
+# Functional test: run a workload on a single snapshot-mirrored image, fail
+# the image over to the other cluster, and verify that the MD5 checksum of
+# the promoted image equals that of the demoted one. Repeated with the
+# primary role alternating between the two clusters.
+#
+# RBD_DEVICE_TYPE must be set in the environment to "krbd" or "nbd".
+
+# abort on the first failing command and trace every command
+set -ex
+
+# IMAGE, MOUNT and RBD_MIRROR_MODE are used further down in this script. The
+# remaining settings are not referenced here; presumably they are consumed
+# by rbd_mirror_helpers.sh, which is why they are assigned before it is
+# sourced -- TODO confirm.
+IMAGE=image-alternate-primary
+MIRROR_IMAGE_MODE=snapshot
+MIRROR_POOL_MODE=image
+MOUNT=test-alternate-primary
+RBD_IMAGE_FEATURES='layering,exclusive-lock,object-map,fast-diff'
+RBD_MIRROR_INSTANCES=1
+RBD_MIRROR_MODE=snapshot
+RBD_MIRROR_USE_EXISTING_CLUSTER=1
+
+# load the helper library that sits next to this script
+. $(dirname $0)/rbd_mirror_helpers.sh
+
+# Take 30 mirror snapshots of an image, sleeping 3 seconds after each one.
+# The script runs this in the background while a workload writes to the
+# image.
+# Arguments: $1 - cluster name, $2 - pool name, $3 - image name
+take_mirror_snapshots() {
+  local cluster=$1
+  local pool=$2
+  local image=$3
+  # keep the counter local: the top-level loop of this script also uses $i
+  # and would be clobbered if this function were ever run in the foreground
+  local i
+
+  for i in {1..30}; do
+    mirror_image_snapshot "${cluster}" "${pool}" "${image}"
+    sleep 3
+  done
+}
+
+# Copy the kernel tarball onto the file system mounted at $1 and unpack it
+# there through a rate limiter, so that the data and metadata of many files
+# change progressively while mirror snapshots are being taken.
+#
+# The pipeline is expected to still be running when the 5 minute timeout
+# ends it (exit status 124). Any other status means the workload stopped on
+# its own, which is reported as a failure.
+# Arguments: $1 - mount point of the image under test
+slow_untar_workload() {
+  local mountpt=$1
+  local status
+
+  cp linux-5.4.tar.gz $mountpt
+  if timeout 5m bash -c "zcat $mountpt/linux-5.4.tar.gz \
+    | pv -L 256K | tar xf - -C $mountpt"; then
+    status=0
+  else
+    status=$?
+  fi
+
+  if ((status == 124)); then
+    return 0
+  fi
+  echo "Workload completed prematurely"
+  return 1
+}
+
+# setup, start_mirrors and the other helpers called below are not defined in
+# this script, and neither are CLUSTER1, CLUSTER2 and POOL; presumably they
+# all come from rbd_mirror_helpers.sh.
+setup
+
+start_mirrors ${CLUSTER1}
+start_mirrors ${CLUSTER2}
+
+# initial setup
+create_image_and_enable_mirror ${CLUSTER1} ${POOL} ${IMAGE} \
+  ${RBD_MIRROR_MODE} 10G
+
+# map the image from CLUSTER1 with the requested device type; DEV receives
+# whatever `rbd device map` prints (presumably the block device path)
+if [[ $RBD_DEVICE_TYPE == "nbd" ]]; then
+  DEV=$(sudo rbd --cluster ${CLUSTER1} device map -t nbd \
+           -o try-netlink ${POOL}/${IMAGE})
+elif [[ $RBD_DEVICE_TYPE == "krbd" ]]; then
+  DEV=$(sudo rbd --cluster ${CLUSTER1} device map -t krbd \
+           ${POOL}/${IMAGE})
+else
+  echo "Unknown RBD_DEVICE_TYPE: ${RBD_DEVICE_TYPE}"
+  exit 1
+fi
+# the file system is created only once; later iterations mount the device
+# that was mapped from the newly promoted image
+sudo mkfs.ext4 ${DEV}
+mkdir ${MOUNT}
+
+# tarball that slow_untar_workload copies to the image and unpacks
+wget https://download.ceph.com/qa/linux-5.4.tar.gz
+
+# one failover per iteration; at the top of every iteration CLUSTER1 names
+# the cluster that currently holds the primary image
+for i in {1..25}; do
+  # create mirror snapshots every few seconds under I/O
+  sudo mount ${DEV} ${MOUNT}
+  sudo chown $(whoami) ${MOUNT}
+  # drop the files left on the image by the previous iteration's workload
+  rm -rf ${MOUNT}/*
+  take_mirror_snapshots ${CLUSTER1} ${POOL} ${IMAGE} &
+  SNAP_PID=$!
+  slow_untar_workload ${MOUNT}
+  # a failed snapshot job makes wait return non-zero, which aborts the
+  # script because of set -e
+  wait $SNAP_PID
+  sudo umount ${MOUNT}
+
+  # calculate hash before demotion of primary image
+  DEMOTE_MD5=$(sudo md5sum ${DEV} | awk '{print $1}')
+  sudo rbd --cluster ${CLUSTER1} device unmap -t ${RBD_DEVICE_TYPE} ${DEV}
+
+  # demote, wait until both clusters report 'up+unknown' for the image, and
+  # only then promote it on the other cluster
+  demote_image ${CLUSTER1} ${POOL} ${IMAGE}
+  wait_for_status_in_pool_dir ${CLUSTER1} ${POOL} ${IMAGE} 'up+unknown'
+  wait_for_status_in_pool_dir ${CLUSTER2} ${POOL} ${IMAGE} 'up+unknown'
+  promote_image ${CLUSTER2} ${POOL} ${IMAGE}
+
+  # calculate hash after promotion of secondary image
+  if [[ $RBD_DEVICE_TYPE == "nbd" ]]; then
+    DEV=$(sudo rbd --cluster ${CLUSTER2} device map -t nbd \
+             -o try-netlink ${POOL}/${IMAGE})
+  elif [[ $RBD_DEVICE_TYPE == "krbd" ]]; then
+    DEV=$(sudo rbd --cluster ${CLUSTER2} device map -t krbd ${POOL}/${IMAGE})
+  fi
+  PROMOTE_MD5=$(sudo md5sum ${DEV} | awk '{print $1}')
+
+  if [[ "${DEMOTE_MD5}" != "${PROMOTE_MD5}" ]]; then
+    echo "Mismatch at iteration ${i}: ${DEMOTE_MD5} != ${PROMOTE_MD5}"
+    exit 1
+  fi
+
+  # DEV stays mapped from the newly promoted image so that the next
+  # iteration can mount it; swap the names so that CLUSTER1 again refers to
+  # the cluster holding the primary image
+  TEMP=${CLUSTER1}
+  CLUSTER1=${CLUSTER2}
+  CLUSTER2=${TEMP}
+done
+
+echo OK
diff --git a/qa/workunits/rbd/compare_mirror_images.sh b/qa/workunits/rbd/compare_mirror_images.sh
new file mode 100755 (executable)
index 0000000..3f45e20
--- /dev/null
@@ -0,0 +1,170 @@
+#!/bin/bash
+
+# Functional test: run workloads on several snapshot-mirrored images at the
+# same time, fail each image over to the other cluster, and verify that the
+# MD5 checksum of every promoted image equals that of its demoted
+# counterpart.
+#
+# RBD_DEVICE_TYPE must be set in the environment to "krbd" or "nbd".
+
+# abort on the first failing command and trace every command
+set -ex
+
+# IMG_PREFIX, MNTPT_PREFIX and RBD_MIRROR_MODE are used further down in this
+# script. The remaining settings are not referenced here; presumably they
+# are consumed by rbd_mirror_helpers.sh, which is why they are assigned
+# before it is sourced -- TODO confirm.
+IMG_PREFIX=image-primary
+MIRROR_IMAGE_MODE=snapshot
+MIRROR_POOL_MODE=image
+MNTPT_PREFIX=test-primary
+RBD_IMAGE_FEATURES='layering,exclusive-lock,object-map,fast-diff'
+RBD_MIRROR_INSTANCES=1
+RBD_MIRROR_MODE=snapshot
+RBD_MIRROR_USE_EXISTING_CLUSTER=1
+
+# load the helper library that sits next to this script
+. $(dirname $0)/rbd_mirror_helpers.sh
+
+# Take 30 mirror snapshots of an image, sleeping 3 seconds after each one.
+# The script runs this in the background while a workload writes to the
+# image.
+# Arguments: $1 - cluster name, $2 - pool name, $3 - image name
+take_mirror_snapshots() {
+  local cluster=$1
+  local pool=$2
+  local image=$3
+  # keep the counter local: the top-level loop of this script also uses $i
+  # and would be clobbered if this function were ever run in the foreground
+  local i
+
+  for i in {1..30}; do
+    mirror_image_snapshot "${cluster}" "${pool}" "${image}"
+    sleep 3
+  done
+}
+
+# Copy the kernel tarball onto the file system mounted at $1 and unpack it
+# there through a rate limiter, so that the data and metadata of many files
+# change progressively while mirror snapshots are being taken.
+#
+# The pipeline is expected to still be running when the 5 minute timeout
+# ends it (exit status 124). Any other status means the workload stopped on
+# its own, which is reported as a failure.
+# Arguments: $1 - mount point of the image under test
+slow_untar_workload() {
+  local mountpt=$1
+  local status
+
+  cp linux-5.4.tar.gz $mountpt
+  if timeout 5m bash -c "zcat $mountpt/linux-5.4.tar.gz \
+    | pv -L 256K | tar xf - -C $mountpt"; then
+    status=0
+  else
+    status=$?
+  fi
+
+  if ((status == 124)); then
+    return 0
+  fi
+  echo "Workload completed prematurely"
+  return 1
+}
+
+# Poll until an image no longer appears in the pool listing of a cluster.
+# Arguments: $1 - cluster name, $2 - pool name, $3 - image name
+# Returns:   0 once the image is no longer listed; 1 if it is still listed,
+#            or the pool could not be listed, after the last retry
+wait_for_image_removal() {
+  local cluster=$1
+  local pool=$2
+  local image=$3
+  local s images
+
+  # retry with increasing delays
+  for s in 1 2 4 8 8 8 8 8 8 8 8 16 16; do
+    # List the pool separately from the match: in a pipeline the exit
+    # status of a failed "rbd ls" is lost and its empty output would be
+    # mistaken for the image having been removed.
+    if images=$(rbd --cluster "${cluster}" ls "${pool}"); then
+      # -F -x: compare the name literally against whole lines, so that an
+      # image is not "found" inside a longer name such as "<image>-clone"
+      if ! grep -Fxq -- "${image}" <<<"${images}"; then
+        return 0
+      fi
+    fi
+    sleep $s
+  done
+
+  echo "image ${pool}/${image} not removed from cluster ${cluster}"
+  return 1
+}
+
+# Fail image number $1 over from CLUSTER1 to CLUSTER2 and check that its
+# contents are unchanged: the MD5 checksum of the block device taken just
+# before the demotion must equal the one taken right after the promotion.
+# Reads DEVS, IMG_PREFIX, MNTPT_PREFIX, POOL, CLUSTER1, CLUSTER2 and
+# RBD_DEVICE_TYPE.
+# Arguments: $1 - 1-based image number (DEVS is indexed with $1-1)
+# Returns:   1 on a checksum mismatch
+compare_demoted_promoted_image() {
+  local dev=${DEVS[$1-1]}
+  local img=${IMG_PREFIX}$1
+  local mntpt=${MNTPT_PREFIX}$1
+  local md5_before md5_after
+
+  sudo umount ${mntpt}
+
+  # checksum of the primary image, read from the still mapped device
+  md5_before=$(sudo md5sum ${dev} | awk '{print $1}')
+  sudo rbd --cluster ${CLUSTER1} device unmap -t ${RBD_DEVICE_TYPE} \
+      ${POOL}/${img}
+
+  # demote, wait until both clusters report 'up+unknown', then promote the
+  # image on the other cluster
+  demote_image ${CLUSTER1} ${POOL} ${img}
+  wait_for_status_in_pool_dir ${CLUSTER1} ${POOL} ${img} 'up+unknown'
+  wait_for_status_in_pool_dir ${CLUSTER2} ${POOL} ${img} 'up+unknown'
+  promote_image ${CLUSTER2} ${POOL} ${img}
+
+  # checksum of the promoted image, mapped from the other cluster
+  case $RBD_DEVICE_TYPE in
+    nbd)
+      dev=$(sudo rbd --cluster ${CLUSTER2} device map -t nbd \
+              -o try-netlink ${POOL}/${img})
+      ;;
+    krbd)
+      dev=$(sudo rbd --cluster ${CLUSTER2} device map -t krbd \
+              ${POOL}/${img})
+      ;;
+  esac
+  md5_after=$(sudo md5sum ${dev} | awk '{print $1}')
+  sudo rbd --cluster ${CLUSTER2} device unmap -t ${RBD_DEVICE_TYPE} ${dev}
+
+  if [[ "${md5_before}" == "${md5_after}" ]]; then
+    return 0
+  fi
+  echo "Mismatch for image ${POOL}/${img}: ${md5_before} != ${md5_after}"
+  return 1
+}
+
+# setup, start_mirrors and the other helpers called below are not defined in
+# this script, and neither are CLUSTER1, CLUSTER2 and POOL; presumably they
+# all come from rbd_mirror_helpers.sh.
+setup
+
+start_mirrors ${CLUSTER1}
+start_mirrors ${CLUSTER2}
+
+# tarball that slow_untar_workload copies to each image and unpacks
+wget https://download.ceph.com/qa/linux-5.4.tar.gz
+
+# 10 rounds; every round creates 10 fresh images on CLUSTER1, fails them
+# over to CLUSTER2 and removes them again
+for i in {1..10}; do
+  DEVS=()
+  SNAP_PIDS=()
+  COMPARE_PIDS=()
+  WORKLOAD_PIDS=()
+  RET=0
+  for j in {1..10}; do
+    IMG=${IMG_PREFIX}${j}
+    MNTPT=${MNTPT_PREFIX}${j}
+    create_image_and_enable_mirror ${CLUSTER1} ${POOL} ${IMG} \
+      ${RBD_MIRROR_MODE} 10G
+    if [[ $RBD_DEVICE_TYPE == "nbd" ]]; then
+      DEV=$(sudo rbd --cluster ${CLUSTER1} device map -t nbd \
+             -o try-netlink ${POOL}/${IMG})
+    elif [[ $RBD_DEVICE_TYPE == "krbd" ]]; then
+      DEV=$(sudo rbd --cluster ${CLUSTER1} device map -t krbd \
+             ${POOL}/${IMG})
+    else
+      echo "Unknown RBD_DEVICE_TYPE: ${RBD_DEVICE_TYPE}"
+      exit 1
+    fi
+    # remembered so that compare_demoted_promoted_image can look the device
+    # up by image number
+    DEVS+=($DEV)
+    sudo mkfs.ext4 ${DEV}
+    mkdir ${MNTPT}
+    sudo mount ${DEV} ${MNTPT}
+    sudo chown $(whoami) ${MNTPT}
+    # create mirror snapshots under I/O every few seconds
+    take_mirror_snapshots ${CLUSTER1} ${POOL} ${IMG} &
+    SNAP_PIDS+=($!)
+    slow_untar_workload ${MNTPT} &
+    WORKLOAD_PIDS+=($!)
+  done
+  # reap every background job; `|| RET=$?` records a failure without
+  # tripping set -e, so that all jobs of the group are waited for first
+  for pid in ${SNAP_PIDS[@]}; do
+    wait $pid || RET=$?
+  done
+  if ((RET != 0)); then
+    echo "take_mirror_snapshots failed"
+    exit 1
+  fi
+  for pid in ${WORKLOAD_PIDS[@]}; do
+    wait $pid || RET=$?
+  done
+  if ((RET != 0)); then
+    echo "slow_untar_workload failed"
+    exit 1
+  fi
+
+  # fail all images over in parallel and compare their checksums
+  for j in {1..10}; do
+    compare_demoted_promoted_image $j &
+    COMPARE_PIDS+=($!)
+  done
+  for pid in ${COMPARE_PIDS[@]}; do
+    wait $pid || RET=$?
+  done
+  if ((RET != 0)); then
+    echo "compare_demoted_promoted_image failed"
+    exit 1
+  fi
+
+  # clean up so that the next round can create the same images and mount
+  # point directories again
+  for j in {1..10}; do
+    IMG=${IMG_PREFIX}${j}
+    # Allow for removal of non-primary image by checking that mirroring
+    # image status is "up+replaying"
+    wait_for_status_in_pool_dir ${CLUSTER1} ${POOL} ${IMG} 'up+replaying'
+    # the image is removed on CLUSTER2, where it was promoted; then wait
+    # until it is no longer listed on CLUSTER1 either
+    remove_image ${CLUSTER2} ${POOL} ${IMG}
+    wait_for_image_removal ${CLUSTER1} ${POOL} ${IMG}
+    rm -rf ${MNTPT_PREFIX}${j}
+  done
+done
+
+echo OK