From f13e6294e5ce8c023a84c32201e449e65cfe57b1 Mon Sep 17 00:00:00 2001 From: Ramana Raja Date: Thu, 25 May 2023 16:48:12 +0000 Subject: [PATCH] qa: Add tests to validate syncing of images using rbd-mirror Introduce functional tests to validate that images under workloads are correctly mirrored between two clusters using snapshot-based mirroring. Run a workload on a primary image using a krbd or nbd client. Take mirror snapshots of the image under workload. Unmount the mapped image and calculate its MD5 checksum before demoting it. After demotion, wait for the mirror status of the image to be 'up+unknown' in both clusters. This is to make sure that the non-primary image in the other cluster is ready to be promoted. Now promote the non-primary image in the other cluster. Map the promoted image and calculate its MD5 checksum. Verify that the checksums of the demoted and promoted images in the two clusters are the same. The above test is run as part of two different workunits: - a workunit that validates the syncing of multiple mirrored images with workloads running on them - another workunit that validates the syncing of a single mirrored image with a workload running on it, where the image is set as primary alternately between the two clusters, as happens during failover and failback scenarios. 
Fixes: https://tracker.ceph.com/issues/61617 Signed-off-by: Ramana Raja Co-authored-by: Ilya Dryomov Co-authored-by: Christopher Hoffman (cherry picked from commit b7aae5c3c5a1dd24c4cb7ceb499292af00bae680) Cherry-pick notes: - In qa/workunits/rbd/compare_mirror_images.sh, replace `wait_for_replaying_status_in_pool_dir` with `wait_for_status_in_pool_dir` Commit 3fd8a03 that added `wait_for_replaying_status_in_pool_dir` not backported --- ...e-mirror-image-alternate-primary-krbd.yaml | 13 ++ ...re-mirror-image-alternate-primary-nbd.yaml | 15 ++ .../workloads/compare-mirror-images-krbd.yaml | 13 ++ .../workloads/compare-mirror-images-nbd.yaml | 15 ++ .../compare_mirror_image_alternate_primary.sh | 106 +++++++++++ qa/workunits/rbd/compare_mirror_images.sh | 170 ++++++++++++++++++ 6 files changed, 332 insertions(+) create mode 100644 qa/suites/rbd/mirror/workloads/compare-mirror-image-alternate-primary-krbd.yaml create mode 100644 qa/suites/rbd/mirror/workloads/compare-mirror-image-alternate-primary-nbd.yaml create mode 100644 qa/suites/rbd/mirror/workloads/compare-mirror-images-krbd.yaml create mode 100644 qa/suites/rbd/mirror/workloads/compare-mirror-images-nbd.yaml create mode 100755 qa/workunits/rbd/compare_mirror_image_alternate_primary.sh create mode 100755 qa/workunits/rbd/compare_mirror_images.sh diff --git a/qa/suites/rbd/mirror/workloads/compare-mirror-image-alternate-primary-krbd.yaml b/qa/suites/rbd/mirror/workloads/compare-mirror-image-alternate-primary-krbd.yaml new file mode 100644 index 0000000000000..771400d01eed6 --- /dev/null +++ b/qa/suites/rbd/mirror/workloads/compare-mirror-image-alternate-primary-krbd.yaml @@ -0,0 +1,13 @@ +overrides: + install: + ceph: + extra_system_packages: + - pv +tasks: +- workunit: + clients: + cluster1.client.mirror: + - rbd/compare_mirror_image_alternate_primary.sh + env: + RBD_DEVICE_TYPE: 'krbd' + timeout: 3h diff --git a/qa/suites/rbd/mirror/workloads/compare-mirror-image-alternate-primary-nbd.yaml 
b/qa/suites/rbd/mirror/workloads/compare-mirror-image-alternate-primary-nbd.yaml new file mode 100644 index 0000000000000..e87d0e8cecc69 --- /dev/null +++ b/qa/suites/rbd/mirror/workloads/compare-mirror-image-alternate-primary-nbd.yaml @@ -0,0 +1,15 @@ +overrides: + install: + ceph: + extra_packages: + - rbd-nbd + extra_system_packages: + - pv +tasks: +- workunit: + clients: + cluster1.client.mirror: + - rbd/compare_mirror_image_alternate_primary.sh + env: + RBD_DEVICE_TYPE: 'nbd' + timeout: 3h diff --git a/qa/suites/rbd/mirror/workloads/compare-mirror-images-krbd.yaml b/qa/suites/rbd/mirror/workloads/compare-mirror-images-krbd.yaml new file mode 100644 index 0000000000000..fc161987f7bd2 --- /dev/null +++ b/qa/suites/rbd/mirror/workloads/compare-mirror-images-krbd.yaml @@ -0,0 +1,13 @@ +overrides: + install: + ceph: + extra_system_packages: + - pv +tasks: +- workunit: + clients: + cluster1.client.mirror: + - rbd/compare_mirror_images.sh + env: + RBD_DEVICE_TYPE: 'krbd' + timeout: 3h diff --git a/qa/suites/rbd/mirror/workloads/compare-mirror-images-nbd.yaml b/qa/suites/rbd/mirror/workloads/compare-mirror-images-nbd.yaml new file mode 100644 index 0000000000000..ed02ed25702fb --- /dev/null +++ b/qa/suites/rbd/mirror/workloads/compare-mirror-images-nbd.yaml @@ -0,0 +1,15 @@ +overrides: + install: + ceph: + extra_packages: + - rbd-nbd + extra_system_packages: + - pv +tasks: +- workunit: + clients: + cluster1.client.mirror: + - rbd/compare_mirror_images.sh + env: + RBD_DEVICE_TYPE: 'nbd' + timeout: 3h diff --git a/qa/workunits/rbd/compare_mirror_image_alternate_primary.sh b/qa/workunits/rbd/compare_mirror_image_alternate_primary.sh new file mode 100755 index 0000000000000..338f43f1e530d --- /dev/null +++ b/qa/workunits/rbd/compare_mirror_image_alternate_primary.sh @@ -0,0 +1,106 @@ +#!/bin/bash + +set -ex + +IMAGE=image-alternate-primary +MIRROR_IMAGE_MODE=snapshot +MIRROR_POOL_MODE=image +MOUNT=test-alternate-primary 
+RBD_IMAGE_FEATURES='layering,exclusive-lock,object-map,fast-diff' +RBD_MIRROR_INSTANCES=1 +RBD_MIRROR_MODE=snapshot +RBD_MIRROR_USE_EXISTING_CLUSTER=1 + +. $(dirname $0)/rbd_mirror_helpers.sh + +take_mirror_snapshots() { + local cluster=$1 + local pool=$2 + local image=$3 + + for i in {1..30}; do + mirror_image_snapshot $cluster $pool $image + sleep 3 + done +} + +slow_untar_workload() { + local mountpt=$1 + + cp linux-5.4.tar.gz $mountpt + # run workload that updates the data and metadata of multiple files on disk. + # rate limit the workload such that the mirror snapshots can be taken as the + # contents of the image are progressively changed by the workload. + local ret=0 + timeout 5m bash -c "zcat $mountpt/linux-5.4.tar.gz \ + | pv -L 256K | tar xf - -C $mountpt" || ret=$? + if ((ret != 124)); then + echo "Workload completed prematurely" + return 1 + fi +} + +setup + +start_mirrors ${CLUSTER1} +start_mirrors ${CLUSTER2} + +# initial setup +create_image_and_enable_mirror ${CLUSTER1} ${POOL} ${IMAGE} \ + ${RBD_MIRROR_MODE} 10G + +if [[ $RBD_DEVICE_TYPE == "nbd" ]]; then + DEV=$(sudo rbd --cluster ${CLUSTER1} device map -t nbd \ + -o try-netlink ${POOL}/${IMAGE}) +elif [[ $RBD_DEVICE_TYPE == "krbd" ]]; then + DEV=$(sudo rbd --cluster ${CLUSTER1} device map -t krbd \ + ${POOL}/${IMAGE}) +else + echo "Unknown RBD_DEVICE_TYPE: ${RBD_DEVICE_TYPE}" + exit 1 +fi +sudo mkfs.ext4 ${DEV} +mkdir ${MOUNT} + +wget https://download.ceph.com/qa/linux-5.4.tar.gz + +for i in {1..25}; do + # create mirror snapshots every few seconds under I/O + sudo mount ${DEV} ${MOUNT} + sudo chown $(whoami) ${MOUNT} + rm -rf ${MOUNT}/* + take_mirror_snapshots ${CLUSTER1} ${POOL} ${IMAGE} & + SNAP_PID=$! 
+ slow_untar_workload ${MOUNT} + wait $SNAP_PID + sudo umount ${MOUNT} + + # calculate hash before demotion of primary image + DEMOTE_MD5=$(sudo md5sum ${DEV} | awk '{print $1}') + sudo rbd --cluster ${CLUSTER1} device unmap -t ${RBD_DEVICE_TYPE} ${DEV} + + demote_image ${CLUSTER1} ${POOL} ${IMAGE} + wait_for_status_in_pool_dir ${CLUSTER1} ${POOL} ${IMAGE} 'up+unknown' + wait_for_status_in_pool_dir ${CLUSTER2} ${POOL} ${IMAGE} 'up+unknown' + promote_image ${CLUSTER2} ${POOL} ${IMAGE} + + # calculate hash after promotion of secondary image + if [[ $RBD_DEVICE_TYPE == "nbd" ]]; then + DEV=$(sudo rbd --cluster ${CLUSTER2} device map -t nbd \ + -o try-netlink ${POOL}/${IMAGE}) + elif [[ $RBD_DEVICE_TYPE == "krbd" ]]; then + DEV=$(sudo rbd --cluster ${CLUSTER2} device map -t krbd ${POOL}/${IMAGE}) + fi + PROMOTE_MD5=$(sudo md5sum ${DEV} | awk '{print $1}') + + if [[ "${DEMOTE_MD5}" != "${PROMOTE_MD5}" ]]; then + echo "Mismatch at iteration ${i}: ${DEMOTE_MD5} != ${PROMOTE_MD5}" + exit 1 + fi + + TEMP=${CLUSTER1} + CLUSTER1=${CLUSTER2} + CLUSTER2=${TEMP} +done + +echo OK diff --git a/qa/workunits/rbd/compare_mirror_images.sh b/qa/workunits/rbd/compare_mirror_images.sh new file mode 100755 index 0000000000000..3f45e20dc611a --- /dev/null +++ b/qa/workunits/rbd/compare_mirror_images.sh @@ -0,0 +1,170 @@ +#!/bin/bash + +set -ex + +IMG_PREFIX=image-primary +MIRROR_IMAGE_MODE=snapshot +MIRROR_POOL_MODE=image +MNTPT_PREFIX=test-primary +RBD_IMAGE_FEATURES='layering,exclusive-lock,object-map,fast-diff' +RBD_MIRROR_INSTANCES=1 +RBD_MIRROR_MODE=snapshot +RBD_MIRROR_USE_EXISTING_CLUSTER=1 + +. $(dirname $0)/rbd_mirror_helpers.sh + +take_mirror_snapshots() { + local cluster=$1 + local pool=$2 + local image=$3 + + for i in {1..30}; do + mirror_image_snapshot $cluster $pool $image + sleep 3 + done +} + +slow_untar_workload() { + local mountpt=$1 + + cp linux-5.4.tar.gz $mountpt + # run workload that updates the data and metadata of multiple files on disk. 
+ # rate limit the workload such that the mirror snapshots can be taken as the + # contents of the image are progressively changed by the workload. + local ret=0 + timeout 5m bash -c "zcat $mountpt/linux-5.4.tar.gz \ + | pv -L 256K | tar xf - -C $mountpt" || ret=$? + if ((ret != 124)); then + echo "Workload completed prematurely" + return 1 + fi +} + +wait_for_image_removal() { + local cluster=$1 + local pool=$2 + local image=$3 + + for s in 1 2 4 8 8 8 8 8 8 8 8 16 16; do + if ! rbd --cluster $cluster ls $pool | grep -wq $image; then + return 0 + fi + sleep $s + done + + echo "image ${pool}/${image} not removed from cluster ${cluster}" + return 1 +} + +compare_demoted_promoted_image() { + local dev=${DEVS[$1-1]} + local img=${IMG_PREFIX}$1 + local mntpt=${MNTPT_PREFIX}$1 + local demote_md5 promote_md5 + + sudo umount ${mntpt} + + # calculate hash before demotion of primary image + demote_md5=$(sudo md5sum ${dev} | awk '{print $1}') + sudo rbd --cluster ${CLUSTER1} device unmap -t ${RBD_DEVICE_TYPE} \ + ${POOL}/${img} + + demote_image ${CLUSTER1} ${POOL} ${img} + wait_for_status_in_pool_dir ${CLUSTER1} ${POOL} ${img} 'up+unknown' + wait_for_status_in_pool_dir ${CLUSTER2} ${POOL} ${img} 'up+unknown' + promote_image ${CLUSTER2} ${POOL} ${img} + + # calculate hash after promotion of secondary image + if [[ $RBD_DEVICE_TYPE == "nbd" ]]; then + dev=$(sudo rbd --cluster ${CLUSTER2} device map -t nbd \ + -o try-netlink ${POOL}/${img}) + elif [[ $RBD_DEVICE_TYPE == "krbd" ]]; then + dev=$(sudo rbd --cluster ${CLUSTER2} device map -t krbd ${POOL}/${img}) + fi + promote_md5=$(sudo md5sum ${dev} | awk '{print $1}') + sudo rbd --cluster ${CLUSTER2} device unmap -t ${RBD_DEVICE_TYPE} ${dev} + + if [[ "${demote_md5}" != "${promote_md5}" ]]; then + echo "Mismatch for image ${POOL}/${img}: ${demote_md5} != ${promote_md5}" + return 1 + fi +} + +setup + +start_mirrors ${CLUSTER1} +start_mirrors ${CLUSTER2} + +wget https://download.ceph.com/qa/linux-5.4.tar.gz + +for i in {1..10}; do 
+ DEVS=() + SNAP_PIDS=() + COMPARE_PIDS=() + WORKLOAD_PIDS=() + RET=0 + for j in {1..10}; do + IMG=${IMG_PREFIX}${j} + MNTPT=${MNTPT_PREFIX}${j} + create_image_and_enable_mirror ${CLUSTER1} ${POOL} ${IMG} \ + ${RBD_MIRROR_MODE} 10G + if [[ $RBD_DEVICE_TYPE == "nbd" ]]; then + DEV=$(sudo rbd --cluster ${CLUSTER1} device map -t nbd \ + -o try-netlink ${POOL}/${IMG}) + elif [[ $RBD_DEVICE_TYPE == "krbd" ]]; then + DEV=$(sudo rbd --cluster ${CLUSTER1} device map -t krbd \ + ${POOL}/${IMG}) + else + echo "Unknown RBD_DEVICE_TYPE: ${RBD_DEVICE_TYPE}" + exit 1 + fi + DEVS+=($DEV) + sudo mkfs.ext4 ${DEV} + mkdir ${MNTPT} + sudo mount ${DEV} ${MNTPT} + sudo chown $(whoami) ${MNTPT} + # create mirror snapshots under I/O every few seconds + take_mirror_snapshots ${CLUSTER1} ${POOL} ${IMG} & + SNAP_PIDS+=($!) + slow_untar_workload ${MNTPT} & + WORKLOAD_PIDS+=($!) + done + for pid in ${SNAP_PIDS[@]}; do + wait $pid || RET=$? + done + if ((RET != 0)); then + echo "take_mirror_snapshots failed" + exit 1 + fi + for pid in ${WORKLOAD_PIDS[@]}; do + wait $pid || RET=$? + done + if ((RET != 0)); then + echo "slow_untar_workload failed" + exit 1 + fi + + for j in {1..10}; do + compare_demoted_promoted_image $j & + COMPARE_PIDS+=($!) + done + for pid in ${COMPARE_PIDS[@]}; do + wait $pid || RET=$? + done + if ((RET != 0)); then + echo "compare_demoted_promoted_image failed" + exit 1 + fi + + for j in {1..10}; do + IMG=${IMG_PREFIX}${j} + # Allow for removal of non-primary image by checking that mirroring + # image status is "up+replaying" + wait_for_status_in_pool_dir ${CLUSTER1} ${POOL} ${IMG} 'up+replaying' + remove_image ${CLUSTER2} ${POOL} ${IMG} + wait_for_image_removal ${CLUSTER1} ${POOL} ${IMG} + rm -rf ${MNTPT_PREFIX}${j} + done +done + +echo OK -- 2.39.5