From b4101c03a5ede2c30c6cbfbf1ec658411bc5c1aa Mon Sep 17 00:00:00 2001 From: John Agombar Date: Tue, 4 Mar 2025 14:24:43 +0000 Subject: [PATCH] qa/workunits/rbd: update to mirror group snapshot tests New tests: - force promote test with daemon running on both clusters - test_enable_mirroring_when_duplicate_group_exists - test_odf_failover_failback test - test_resync_marker test - test_force_promote_before_initial_sync test - scenarios in test_create_group_with_images_then_mirror_with_regular_snapshots Disabled tests: - test_force_promote scenarios 2 & 3 which repeatedly fail Renamed tests: - test_multiple_user_snapshot_time to test_multiple_mirror_group_snapshot_unlink_time - test_multiple_user_snapshot_whilst_stopped to test_multiple_mirror_group_snapshot_whilst_stopped Signed-off-by: John Agombar --- qa/workunits/rbd/rbd_mirror_group_simple.sh | 643 ++++++++++++++++---- qa/workunits/rbd/rbd_mirror_helpers.sh | 135 ++-- 2 files changed, 627 insertions(+), 151 deletions(-) diff --git a/qa/workunits/rbd/rbd_mirror_group_simple.sh b/qa/workunits/rbd/rbd_mirror_group_simple.sh index aada13a3494cf..0bcf5bd946a32 100755 --- a/qa/workunits/rbd/rbd_mirror_group_simple.sh +++ b/qa/workunits/rbd/rbd_mirror_group_simple.sh @@ -186,10 +186,22 @@ test_create_group_with_images_then_mirror() images_remove "${primary_cluster}" "${pool}/${image_prefix}" "${image_count}" } -declare -a test_enable_mirroring_when_duplicate_group_exists_1=("${CLUSTER2}" "${CLUSTER1}" "${pool0}" "${group0}" "${image_prefix}" 5) - -test_enable_mirroring_when_duplicate_group_exists_scenarios=1 - +declare -a test_enable_mirroring_when_duplicate_group_exists_1=("${CLUSTER2}" "${CLUSTER1}" "${pool0}" "${group0}" "${image_prefix}" 2 'remove') +declare -a test_enable_mirroring_when_duplicate_group_exists_2=("${CLUSTER2}" "${CLUSTER1}" "${pool0}" "${group0}" "${image_prefix}" 2 'rename_secondary') +declare -a test_enable_mirroring_when_duplicate_group_exists_3=("${CLUSTER2}" "${CLUSTER1}" "${pool0}" "${group0}" "${image_prefix}" 2 'rename_primary') + +test_enable_mirroring_when_duplicate_group_exists_scenarios=2 +# TODO scenario 3 fails at the moment + +# This test does the following +# 1. create a group with images on primary site +# 2. create a group with the same name with images with the same name on the secondary site +# 3. enable mirroring on the primary site +# 4. take different actions to allow mirroring to proceed +# scenario 1 - delete the duplicate named group and images on the secondary +# scenario 2 - rename the duplicate named group and images on the secondary +# scenario 3 - rename the duplicate named group and images on the primary +# 5. 
check that group and all images are successfully mirrored to secondary test_enable_mirroring_when_duplicate_group_exists() { local primary_cluster=$1 ; shift @@ -198,6 +210,7 @@ test_enable_mirroring_when_duplicate_group_exists() local group=$1 ; shift local image_prefix=$1 ; shift local image_count=$(($1*"${image_multiplier}")) ; shift + local scenario=$1 ; shift group_create "${primary_cluster}" "${pool}/${group}" images_create "${primary_cluster}" "${pool}/${image_prefix}" "${image_count}" @@ -209,23 +222,51 @@ test_enable_mirroring_when_duplicate_group_exists() mirror_group_enable "${primary_cluster}" "${pool}/${group}" + # group will be present on secondary, but won't be mirrored wait_for_group_present "${secondary_cluster}" "${pool}" "${group}" "${image_count}" - check_daemon_running "${secondary_cluster}" -exit 0 -# TODO finish this test - group should not be mirrored on secondary at this point. -# peer status when looking at status on primary should be "up+stopped" -# could rename group on secondary without renaming images -# or could delete group on secondary. -# or rename group on primary? - # ceph --daemon mirror group status groupName - wait_for_group_replay_started "${secondary_cluster}" "${pool}"/"${group}" "${image_count}" + wait_for_group_status_in_pool_dir "${primary_cluster}" "${pool}"/"${group}" 'down+unknown' + test_fields_in_group_info ${primary_cluster} ${pool}/${group} 'snapshot' 'enabled' 'true' check_daemon_running "${secondary_cluster}" - # rbd mirror group status groupName - #sleep 10 + if [ "${scenario}" = 'remove' ]; then + # remove the non-mirrored group on the secondary + group_remove "${secondary_cluster}" "${pool}/${group}" + elif [ "${scenario}" = 'rename_secondary' ]; then + group_rename "${secondary_cluster}" "${pool}/${group}" "${pool}/${group}_renamed" + elif [ "${scenario}" = 'rename_primary' ]; then + group_rename "${primary_cluster}" "${pool}/${group}" "${pool}/${group}_renamed" + group_orig="${group}" + group="${group}_renamed" + fi + + # group should now be mirrored, but images can't be + wait_for_group_status_in_pool_dir "${secondary_cluster}" "${pool}"/"${group}" 'up+error' + test_fields_in_group_info ${secondary_cluster} ${pool}/${group} 'snapshot' 'enabled' 'false' + + if [ "${scenario}" = 'remove' ]; then + # remove the non-mirrored images on the secondary + images_remove "${secondary_cluster}" "${pool}/${image_prefix}" "${image_count}" + elif [ "${scenario}" = 'rename_secondary' ]; then + local i + for i in $(seq 0 $((image_count-1))); do + image_rename "${secondary_cluster}" "${pool}/${image_prefix}${i}" "${pool}/${image_prefix}_renamed${i}" + done + elif [ "${scenario}" = 'rename_primary' ]; then + local i + for i in $(seq 0 $((image_count-1))); do + image_rename "${primary_cluster}" "${pool}/${image_prefix}${i}" "${pool}/${image_prefix}_renamed${i}" + done + image_prefix_orig="${image_prefix}" + image_prefix="${image_prefix}_renamed" + fi + + # TODO scenario 3 fails on the next line - no images are listed in the group wait_for_group_status_in_pool_dir "${secondary_cluster}" "${pool}"/"${group}" 'up+replaying' "${image_count}" wait_for_group_synced "${primary_cluster}" "${pool}/${group}" + wait_for_group_replay_started "${secondary_cluster}" "${pool}"/"${group}" "${image_count}" + check_daemon_running "${secondary_cluster}" + if [ -z "${RBD_MIRROR_USE_RBD_MIRROR}" ]; then wait_for_group_status_in_pool_dir "${primary_cluster}" "${pool}"/"${group}" 'down+unknown' 0 fi @@ -234,8 +275,15 @@ exit 0 wait_for_group_not_present 
"${primary_cluster}" "${pool}" "${group}" wait_for_group_not_present "${secondary_cluster}" "${pool}" "${group}" check_daemon_running "${secondary_cluster}" - images_remove "${primary_cluster}" "${pool}/${image_prefix}" "${image_count}" + + if [ "${scenario}" = 'rename_secondary' ]; then + group_remove "${secondary_cluster}" "${pool}/${group}_renamed" + images_remove "${secondary_cluster}" "${pool}/${image_prefix}_renamed" "${image_count}" + elif [ "${scenario}" = 'rename_primary' ]; then + group_remove "${secondary_cluster}" "${pool}${group_orig}" + images_remove "${secondary_cluster}" "${pool}/${image_prefix_orig}" "${image_count}" + fi } # record the time taken to enable and sync for a group with increasing number of images. @@ -330,7 +378,6 @@ declare -a test_create_group_stop_daemon_then_recreate_1=("${CLUSTER2}" "${CLUST declare -a test_create_group_stop_daemon_then_recreate_2=("${CLUSTER2}" "${CLUSTER1}" 'stop_restart_before_recreate') declare -a test_create_group_stop_daemon_then_recreate_3=("${CLUSTER2}" "${CLUSTER1}" 'stop_restart_after_recreate') -# TODO enable scenarios 2 and 3 when they pass test_create_group_stop_daemon_then_recreate_scenarios=3 test_create_group_stop_daemon_then_recreate() @@ -342,8 +389,6 @@ test_create_group_stop_daemon_then_recreate() local pool="${pool0}" local group="${group0}" - testlog "test_create_group_stop_daemon_then_recreate" - images_create "${primary_cluster}" "${pool}/${image_prefix}" 1 group_create "${primary_cluster}" "${pool}/${group}" @@ -385,7 +430,6 @@ test_create_group_stop_daemon_then_recreate() wait_for_group_replay_started "${secondary_cluster}" "${pool}"/"${group}" 1 wait_for_group_status_in_pool_dir "${secondary_cluster}" "${pool}"/"${group}" 'up+replaying' 1 - # remove and recreate group - expect 2 keys due to bug TODO group_remove "${primary_cluster}" "${pool}/${group}" group_create "${primary_cluster}" "${pool}/${group}" mirror_group_enable "${primary_cluster}" "${pool}/${group}" @@ -407,7 +451,6 @@ test_create_group_stop_daemon_then_recreate() echo "restarting daemon on secondary" start_mirrors "${secondary_cluster}" - # TODO fails on next step - group is not present on secondary (restarting the daemon again will cause it to appear) wait_for_group_present "${secondary_cluster}" "${pool}" "${group}" 0 wait_for_group_replay_started "${secondary_cluster}" "${pool}"/"${group}" 0 wait_for_group_status_in_pool_dir "${secondary_cluster}" "${pool}"/"${group}" 'up+replaying' 0 @@ -419,9 +462,13 @@ test_create_group_stop_daemon_then_recreate() mirror_group_disable "${primary_cluster}" "${pool}/${group}" wait_for_group_not_present "${secondary_cluster}" "${pool}" "${group}" -# Not sure what the expected count is here - have asked Nithya -# count_omap_keys_with_filter "${secondary_cluster}" "${pool}" "rbd_mirror_leader" "image_map" key_count -# test "${key_count}" = 2 || fail "unexpected key count:${key_count}" + # Wait for rbd_mirror_leader to be empty + for s in 0.1 1 2 4 8 8 8 8 8 8 8 8 16 16; do + sleep ${s} + count_omap_keys_with_filter "${secondary_cluster}" "${pool}" "rbd_mirror_leader" "image_map" key_count + test "${key_count}" = 0 && break + done + test "${key_count}" = 0 || fail "unexpected key count:${key_count}" check_daemon_running "${secondary_cluster}" @@ -1071,9 +1118,11 @@ test_images_different_pools() } # create regular group snapshots and test replay -declare -a test_create_group_with_images_then_mirror_with_regular_snapshots_1=("${CLUSTER2}" "${CLUSTER1}" "${pool0}" "${group0}" "${image_prefix}" 5) +declare -a 
test_create_group_with_images_then_mirror_with_regular_snapshots_1=("${CLUSTER2}" "${CLUSTER1}" "${pool0}" "${group0}" "${image_prefix}" 2 'remove_snap') +declare -a test_create_group_with_images_then_mirror_with_regular_snapshots_2=("${CLUSTER2}" "${CLUSTER1}" "${pool0}" "${group0}" "${image_prefix}" 2 'leave_snap') +declare -a test_create_group_with_images_then_mirror_with_regular_snapshots_3=("${CLUSTER2}" "${CLUSTER1}" "${pool0}" "${group0}" "${image_prefix}" 2 'force_disable') -test_create_group_with_images_then_mirror_with_regular_snapshots_scenarios=1 +test_create_group_with_images_then_mirror_with_regular_snapshots_scenarios=3 test_create_group_with_images_then_mirror_with_regular_snapshots() { @@ -1083,6 +1132,7 @@ test_create_group_with_images_then_mirror_with_regular_snapshots() local group=$1 ; shift local image_prefix=$1 ; shift local image_count=$(($1*"${image_multiplier}")) ; shift + local scenario=$1 ; shift local snap='regular_snap' @@ -1104,28 +1154,31 @@ test_create_group_with_images_then_mirror_with_regular_snapshots() group_snap_create "${primary_cluster}" "${pool}/${group}" "${snap}" check_group_snap_exists "${primary_cluster}" "${pool}/${group}" "${snap}" - # snap is currently copied to secondary cluster, where it remains in the "incomplete" state, but this is maybe incorrect - see slack thread TODO - # - should not be copied until mirrored. - mirror_group_snapshot_and_wait_for_sync_complete "${secondary_cluster}" "${primary_cluster}" "${pool}"/"${group}" - - check_group_snap_exists "${secondary_cluster}" "${pool}/${group}" "${snap}" + check_group_snap_doesnt_exist "${secondary_cluster}" "${pool}/${group}" "${snap}" - group_snap_remove "${primary_cluster}" "${pool}/${group}" "${snap}" - check_group_snap_doesnt_exist "${primary_cluster}" "${pool}/${group}" "${snap}" - # this next extra mirror_group_snapshot should not be needed - waiting for fix TODO - mirror_group_snapshot "${primary_cluster}" "${pool}/${group}" mirror_group_snapshot_and_wait_for_sync_complete "${secondary_cluster}" "${primary_cluster}" "${pool}"/"${group}" - check_group_snap_doesnt_exist "${secondary_cluster}" "${pool}/${group}" "${snap}" + check_group_snap_exists "${secondary_cluster}" "${pool}/${group}" "${snap}" - #TODO DEFECT - #exit 0 - # if I exit at this point and then - # - force disable mirroring for the group on the secondary - # - remove the group on the secondary - # we end up with snapshots that belong to the group being left lying around. 
- # see discussion in slack, might need defect + if [ "${scenario}" = 'remove_snap' ]; then + group_snap_remove "${primary_cluster}" "${pool}/${group}" "${snap}" + check_group_snap_doesnt_exist "${primary_cluster}" "${pool}/${group}" "${snap}" + # this next extra mirror_group_snapshot should not be needed - waiting for fix TODO + mirror_group_snapshot "${primary_cluster}" "${pool}/${group}" + mirror_group_snapshot_and_wait_for_sync_complete "${secondary_cluster}" "${primary_cluster}" "${pool}"/"${group}" + check_group_snap_doesnt_exist "${secondary_cluster}" "${pool}/${group}" "${snap}" + else + check_group_snap_exists "${primary_cluster}" "${pool}/${group}" "${snap}" + check_group_snap_exists "${secondary_cluster}" "${pool}/${group}" "${snap}" + fi - #TODO also try taking multiple regular group snapshots and check the behaviour there + if [ "${scenario}" = 'force_disable' ]; then + # Force disable mirroring on the secondary and check that everything can be cleaned up + mirror_group_disable "${secondary_cluster}" "${pool}/${group}" '--force' + group_remove "${secondary_cluster}" "${pool}/${group}" + wait_for_group_present "${primary_cluster}" "${pool}" "${group}" "${image_count}" + wait_for_group_not_present "${secondary_cluster}" "${pool}" "${group}" + images_remove "${secondary_cluster}" "${pool}/${image_prefix}" "${image_count}" + fi mirror_group_disable "${primary_cluster}" "${pool}/${group}" group_remove "${primary_cluster}" "${pool}/${group}" @@ -1411,7 +1464,6 @@ test_stopped_daemon() echo "starting daemon" start_mirrors "${secondary_cluster}" - # TODO often fails on next step - group is not present on secondary. wait_for_group_replay_started "${secondary_cluster}" "${pool}"/"${group}" $(("${group_image_count}"+1)) wait_for_group_status_in_pool_dir "${secondary_cluster}" "${pool}"/"${group}" 'up+replaying' $(("${group_image_count}"+1)) wait_for_group_synced "${primary_cluster}" "${pool}"/"${group}" @@ -1432,10 +1484,8 @@ test_stopped_daemon() fi else mirror_group_disable "${primary_cluster}" "${pool}/${group}" -# wait_for_group_not_present "${secondary_cluster}" "${pool}" "${group}" group_image_remove "${primary_cluster}" "${pool}/${group}" "${pool}/${image_name}" mirror_group_enable "${primary_cluster}" "${pool}/${group}" - # wait_for_group_present "${secondary_cluster}" "${pool}" "${group}" $(("${group_image_count}")) fi get_newest_group_snapshot_id "${primary_cluster}" "${pool}"/"${group}" primary_group_snap_id @@ -1446,8 +1496,6 @@ test_stopped_daemon() wait_for_group_replay_started "${secondary_cluster}" "${pool}"/"${group}" "${group_image_count}" wait_for_group_status_in_pool_dir "${secondary_cluster}" "${pool}"/"${group}" 'up+replaying' "${group_image_count}" - # TODO next command fails because rbd group snap list command fails with -2 - # though group does exist on secondary wait_for_group_synced "${primary_cluster}" "${pool}"/"${group}" get_newest_group_snapshot_id "${secondary_cluster}" "${pool}"/"${group}" secondary_group_snap_id @@ -1973,9 +2021,10 @@ declare -a test_force_promote_1=("${CLUSTER2}" "${CLUSTER1}" "${pool0}" "${image declare -a test_force_promote_2=("${CLUSTER2}" "${CLUSTER1}" "${pool0}" "${image_prefix}" 'image_expand' 5) declare -a test_force_promote_3=("${CLUSTER2}" "${CLUSTER1}" "${pool0}" "${image_prefix}" 'image_shrink' 5) declare -a test_force_promote_4=("${CLUSTER2}" "${CLUSTER1}" "${pool0}" "${image_prefix}" 'image_rename' 5) +declare -a test_force_promote_5=("${CLUSTER2}" "${CLUSTER1}" "${pool0}" "${image_prefix}" 'no_change_primary_up' 5) -# TODO 
scenarios 2, 3 and 4 are currently failing -test_force_promote_scenarios=3 +# TODO scenarios 2-5 are currently failing - 4 is low priority +test_force_promote_scenarios=1 test_force_promote() { @@ -1990,6 +2039,10 @@ test_force_promote() local snap0='snap_0' local snap1='snap_1' + if [ "${scenario}" = 'no_change_primary_up' ]; then + start_mirrors "${primary_cluster}" + fi + group_create "${primary_cluster}" "${pool}/${group0}" images_create "${primary_cluster}" "${pool}/${image_prefix}" $(("${image_count}"-1)) write_image "${primary_cluster}" "${pool}" "${image_prefix}0" 10 4096 @@ -2008,7 +2061,11 @@ test_force_promote() wait_for_group_status_in_pool_dir "${secondary_cluster}" "${pool}"/"${group0}" 'up+replaying' "${image_count}" if [ -z "${RBD_MIRROR_USE_RBD_MIRROR}" ]; then - wait_for_group_status_in_pool_dir "${primary_cluster}" "${pool}"/"${group0}" 'down+unknown' 0 + if [ "${scenario}" = 'no_change_primary_up' ]; then + wait_for_group_status_in_pool_dir "${primary_cluster}" "${pool}"/"${group0}" 'up+stopped' 0 + else + wait_for_group_status_in_pool_dir "${primary_cluster}" "${pool}"/"${group0}" 'down+unknown' 0 + fi fi wait_for_group_synced "${primary_cluster}" "${pool}"/"${group0}" @@ -2054,7 +2111,7 @@ test_force_promote() test_image_size_matches "${primary_cluster}" "${pool}/${image_prefix}3" $(("${image_size}"-4*1024*1024)) test_image_size_matches "${secondary_cluster}" "${pool}/${image_prefix}3" "${image_size}" mirror_group_snapshot "${primary_cluster}" "${pool}/${group0}" - elif [ "${scenario}" = 'no_change' ]; then + elif [ "${scenario}" = 'no_change' ] || [ "${scenario}" = 'no_change_primary_up' ]; then mirror_group_snapshot "${primary_cluster}" "${pool}/${group0}" fi @@ -2089,36 +2146,21 @@ test_force_promote() fi # stop the daemon to prevent further syncing of snapshots - stop_mirrors "${secondary_cluster}" + stop_mirrors "${secondary_cluster}" '-9' # check that latest snap is incomplete - ## this fails in the delete case as follows: - ##CEPH_ARGS='--id mirror' rbd --cluster cluster1 group snap list mirror/group_0 - ##ERR: rc= 2 test_group_snap_sync_incomplete "${secondary_cluster}" "${pool}/${group0}" "${group_snap_id}" - # TODO remove - just capturing debug info - try_cmd "rbd --cluster ${primary_cluster} group snap list ${pool}/${group0}" || : - try_cmd "rbd --cluster ${secondary_cluster} group snap list ${pool}/${group0}" || : - # force promote the group on the secondary - should rollback to the last complete snapshot local old_primary_cluster mirror_group_promote "${secondary_cluster}" "${pool}/${group0}" '--force' - # TODO remove - just capturing debug info - try_cmd "rbd --cluster ${primary_cluster} group snap list ${pool}/${group0}" || : - try_cmd "rbd --cluster ${secondary_cluster} group snap list ${pool}/${group0}" || : - old_primary_cluster="${primary_cluster}" primary_cluster="${secondary_cluster}" mirror_group_demote "${old_primary_cluster}" "${pool}/${group0}" secondary_cluster="${old_primary_cluster}" - # TODO remove - just capturing debug info - try_cmd "rbd --cluster ${secondary_cluster} group snap list ${pool}/${group0}" || : - try_cmd "rbd --cluster ${primary_cluster} group snap list ${pool}/${group0}" || : - # Check that the rollback reverted the state if [ "${scenario}" = 'image_add' ]; then # check that new image is not present @@ -2135,15 +2177,23 @@ test_force_promote() test_image_size_matches "${primary_cluster}" "${pool}/${image_prefix}3" "${image_size}" || fail "size mismatch" fi + local group_id_before + get_id_from_group_info 
${secondary_cluster} ${pool}/${group0} group_id_before + mirror_group_resync ${secondary_cluster} ${pool}/${group0} - start_mirrors "${secondary_cluster}" - sleep 5 + if [ "${scenario}" != 'no_change_primary_up' ]; then + start_mirrors "${secondary_cluster}" + sleep 5 + fi # TODO check that data can be copied back to original primary cluster # next line fails because latest snapshot on primary is never copied back to secondary # finish off the resync function # check that tidy up steps below work wait_for_group_synced "${primary_cluster}" "${pool}"/"${group0}" + local group_id_after + get_id_from_group_info ${secondary_cluster} ${pool}/${group0} group_id_after + test "${group_id_before}" != "${group_id_after}" || fail "group was not recreated" compare_image_with_snapshot "${secondary_cluster}" "${pool}/${image_prefix}0" "${primary_cluster}" "${pool}/${image_prefix}0@${snap0}" @@ -2162,7 +2212,7 @@ test_force_promote() images_remove "${primary_cluster}" "${pool}/${image_prefix}" $(("${image_count}"-1)) image_remove "${primary_cluster}" "${pool}/${big_image}" - # Note: we altered primary and secondary cluster, so reset. + # Note: we altered primary and secondary cluster, so reset and restart daemon old_primary_cluster="${primary_cluster}" primary_cluster="${secondary_cluster}" secondary_cluster="${old_primary_cluster}" @@ -2204,8 +2254,10 @@ test_force_promote_delete_group() wait_for_group_synced "${primary_cluster}" "${pool}"/"${group0}" # force promote the group on the secondary - # TODO disable mirror daemon here - see slack thread https://ibm-systems-storage.slack.com/archives/C07J9Q2E268/p1739856204809159 + # disable mirror daemon here - see slack thread https://ibm-systems-storage.slack.com/archives/C07J9Q2E268/p1739856204809159 + stop_mirrors "${secondary_cluster}" '-9' mirror_group_promote "${secondary_cluster}" "${pool}/${group0}" '--force' + start_mirrors "${secondary_cluster}" wait_for_group_replay_stopped ${secondary_cluster} ${pool}/${group0} wait_for_group_replay_stopped ${primary_cluster} ${pool}/${group0} wait_for_group_status_in_pool_dir ${secondary_cluster} ${pool}/${group0} 'up+stopped' "${image_count}" @@ -2236,7 +2288,6 @@ test_force_promote_delete_group() wait_for_group_status_in_pool_dir ${primary_cluster} ${pool}/${group0} 'up+stopped' "${image_count}" wait_for_group_status_in_pool_dir ${secondary_cluster} ${pool}/${group0} 'up+stopped' $(("${image_count}"-1)) - # TODO - test normally fails on next line with missing images wait_for_group_present "${secondary_cluster}" "${pool}" "${group0}" $(("${image_count}"-1)) wait_for_group_present "${primary_cluster}" "${pool}" "${group0}" "${image_count}" @@ -2264,12 +2315,97 @@ test_force_promote_delete_group() stop_mirrors "${primary_cluster}" } +declare -a test_force_promote_before_initial_sync_1=("${CLUSTER2}" "${CLUSTER1}" "${pool0}" "${image_prefix}" 5) + +test_force_promote_before_initial_sync_scenarios=1 + +test_force_promote_before_initial_sync() +{ + local primary_cluster=$1 ; shift + local secondary_cluster=$1 ; shift + local pool=$1 ; shift + local image_prefix=$1 ; shift + local image_count=$(($1*"${image_multiplier}")) ; shift + + local group0=test-group0 + + group_create "${primary_cluster}" "${pool}/${group0}" + images_create "${primary_cluster}" "${pool}/${image_prefix}" $(("${image_count}"-1)) + write_image "${primary_cluster}" "${pool}" "${image_prefix}0" 10 4096 + group_images_add "${primary_cluster}" "${pool}/${group0}" "${pool}/${image_prefix}" $(("${image_count}"-1)) + + big_image=test-image-big + 
image_create "${primary_cluster}" "${pool}/${big_image}" 4G + # make some changes to the big image so that the sync will take a long time + write_image "${primary_cluster}" "${pool}" "${big_image}" 1024 4194304 + group_image_add "${primary_cluster}" "${pool}/${group0}" "${pool}/${big_image}" + + mirror_group_enable "${primary_cluster}" "${pool}/${group0}" + wait_for_group_present "${secondary_cluster}" "${pool}" "${group0}" "${image_count}" + + wait_for_group_replay_started "${secondary_cluster}" "${pool}"/"${group0}" "${image_count}" + wait_for_group_status_in_pool_dir "${secondary_cluster}" "${pool}"/"${group0}" 'up+replaying' "${image_count}" + + if [ -z "${RBD_MIRROR_USE_RBD_MIRROR}" ]; then + wait_for_group_status_in_pool_dir "${primary_cluster}" "${pool}"/"${group0}" 'down+unknown' 0 + fi + + local group_snap_id + get_newest_group_snapshot_id "${primary_cluster}" "${pool}/${group0}" group_snap_id + wait_for_test_group_snap_present "${secondary_cluster}" "${pool}/${group0}" "${group_snap_id}" 1 + + # stop the daemon to prevent further syncing of snapshots + stop_mirrors "${secondary_cluster}" '-9' + + # check that latest snap is incomplete + test_group_snap_sync_incomplete "${secondary_cluster}" "${pool}/${group0}" "${group_snap_id}" + + # force promote the group on the secondary - TODO not sure if this should fail or not + # see https://ibm-systems-storage.slack.com/archives/C07J9Q2E268/p1741107842904719?thread_ts=1740716823.395479&cid=C07J9Q2E268 + mirror_group_promote "${secondary_cluster}" "${pool}/${group0}" '--force' + + # demote and try to resync again + mirror_group_demote "${secondary_cluster}" "${pool}/${group0}" + + mirror_group_resync ${secondary_cluster} ${pool}/${group0} + start_mirrors "${secondary_cluster}" + sleep 5 + + wait_for_group_synced "${primary_cluster}" "${pool}"/"${group0}" + + # try another force promote - this time it should work + stop_mirrors "${secondary_cluster}" '-9' + mirror_group_promote "${secondary_cluster}" "${pool}/${group0}" '--force' + + # demote and try to resync again + mirror_group_demote "${secondary_cluster}" "${pool}/${group0}" + + mirror_group_resync ${secondary_cluster} ${pool}/${group0} + start_mirrors "${secondary_cluster}" + sleep 5 + + wait_for_group_synced "${primary_cluster}" "${pool}"/"${group0}" + + # tidy up + mirror_group_disable "${primary_cluster}" "${pool}/${group0}" + group_remove "${primary_cluster}" "${pool}/${group0}" + + wait_for_group_not_present "${primary_cluster}" "${pool}" "${group0}" + wait_for_group_not_present "${secondary_cluster}" "${pool}" "${group0}" + + images_remove "${primary_cluster}" "${pool}/${image_prefix}" $(("${image_count}"-1)) + image_remove "${primary_cluster}" "${pool}/${big_image}" + + stop_mirrors "${primary_cluster}" + start_mirrors "${secondary_cluster}" +} + # test force unlink time -declare -a test_multiple_user_snapshot_time_1=("${CLUSTER2}" "${CLUSTER1}" "${pool0}") +declare -a test_multiple_mirror_group_snapshot_unlink_time_1=("${CLUSTER2}" "${CLUSTER1}" "${pool0}") -test_multiple_user_snapshot_time_scenarios=1 +test_multiple_mirror_group_snapshot_unlink_time_scenarios=1 -test_multiple_user_snapshot_time() +test_multiple_mirror_group_snapshot_unlink_time() { local primary_cluster=$1 local secondary_cluster=$2 @@ -2281,7 +2417,7 @@ test_multiple_user_snapshot_time() local time for image_count in "${image_counts[@]}"; do - test_multiple_user_snapshot_whilst_stopped "${primary_cluster}" "${secondary_cluster}" "${pool}" "${image_count}" time + 
test_multiple_mirror_group_snapshot_whilst_stopped "${primary_cluster}" "${secondary_cluster}" "${pool}" "${image_count}" time results+=(${time}) done @@ -2295,11 +2431,11 @@ test_multiple_user_snapshot_time() } # test force promote scenarios -declare -a test_multiple_user_snapshot_whilst_stopped_1=("${CLUSTER2}" "${CLUSTER1}" "${pool0}" 5) +declare -a test_multiple_mirror_group_snapshot_whilst_stopped_1=("${CLUSTER2}" "${CLUSTER1}" "${pool0}" 5) -test_multiple_user_snapshot_whilst_stopped_scenarios=1 +test_multiple_mirror_group_snapshot_whilst_stopped_scenarios=1 -test_multiple_user_snapshot_whilst_stopped() +test_multiple_mirror_group_snapshot_whilst_stopped() { local primary_cluster=$1 ; shift local secondary_cluster=$1 ; shift @@ -2323,11 +2459,14 @@ test_multiple_user_snapshot_whilst_stopped() wait_for_group_replay_started "${secondary_cluster}" "${pool}"/"${group0}" "${image_count}" wait_for_group_status_in_pool_dir "${secondary_cluster}" "${pool}"/"${group0}" 'up+replaying' "${image_count}" + wait_for_group_synced "${primary_cluster}" "${pool}"/"${group0}" + echo "stopping daemon on secondary" stop_mirrors "${secondary_cluster}" - # TODO starting the daemon on the primary seem to cause a problem with image deletion - Nithya investigating (see slack thread) - #echo "starting daemon on primary" - #start_mirrors "${primary_cluster}" + + local count + get_group_snap_count "${secondary_cluster}" "${pool}"/"${group0}" '*' count + test "${count}" = 1 || { fail "snap count = ${count}"; return 1; } local start_time end_time local times_result_arr=() @@ -2352,7 +2491,6 @@ test_multiple_user_snapshot_whilst_stopped() _average_snapshot_time=$((total/cnt)) fi - local count get_group_snap_count "${primary_cluster}" "${pool}"/"${group0}" '*' count test "${count}" -gt 3 || { fail "snap count = ${count}"; return 1; } @@ -2380,6 +2518,253 @@ test_multiple_user_snapshot_whilst_stopped() images_remove "${primary_cluster}" "${pool}/${image_prefix}" "${image_count}" } +# test ODF failover/failback sequence +declare -a test_odf_failover_failback_1=("${CLUSTER2}" "${CLUSTER1}" "${pool0}" "${image_prefix}" 'wait_before_promote' 3) +declare -a test_odf_failover_failback_2=("${CLUSTER2}" "${CLUSTER1}" "${pool0}" "${image_prefix}" 'retry_promote' 3) + +test_odf_failover_failback_scenarios=2 + +# ODF takes the following steps in failover/failback. This test does the same. 
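+#
+# Two scenarios are run: 'wait_before_promote' waits for the demote snapshot to be
+# synced before promoting site-a again, while 'retry_promote' promotes site-a
+# immediately and retries until the promote succeeds.
+#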
+#Failover: +# rbd --cluster=site-b mirror group promote test_pool/test_group --force +# rbd --cluster=site-a mirror group demote test_pool/test_group +# rbd --cluster=site-a mirror group resync test_pool/test_group +# +#Failback: +# rbd --cluster=site-b mirror group demote test_pool/test_group +# rbd --cluster=site-b mirror group resync test_pool/test_group +# rbd --cluster=site-a mirror group promote test_pool/test_group +test_odf_failover_failback() +{ + local primary_cluster=$1 ; shift + local secondary_cluster=$1 ; shift + local pool=$1 ; shift + local image_prefix=$1 ; shift + local scenario=$1 ; shift + local image_count=$(($1*"${image_multiplier}")) ; shift + + local snap0='snap_0' + local snap1='snap_1' + + # ODF has daemon running on both clusters always + start_mirrors "${primary_cluster}" + + group_create "${primary_cluster}" "${pool}/${group0}" + images_create "${primary_cluster}" "${pool}/${image_prefix}" "${image_count}" + write_image "${primary_cluster}" "${pool}" "${image_prefix}0" 10 4096 + group_images_add "${primary_cluster}" "${pool}/${group0}" "${pool}/${image_prefix}" "${image_count}" + + create_snapshot "${primary_cluster}" "${pool}" "${image_prefix}0" "${snap0}" + compare_image_with_snapshot "${primary_cluster}" "${pool}/${image_prefix}0" "${primary_cluster}" "${pool}/${image_prefix}0@${snap0}" + + mirror_group_enable "${primary_cluster}" "${pool}/${group0}" + wait_for_group_present "${secondary_cluster}" "${pool}" "${group0}" "${image_count}" + wait_for_group_replay_started "${secondary_cluster}" "${pool}"/"${group0}" "${image_count}" + wait_for_group_status_in_pool_dir "${secondary_cluster}" "${pool}"/"${group0}" 'up+replaying' "${image_count}" + wait_for_group_status_in_pool_dir "${primary_cluster}" "${pool}"/"${group0}" 'up+stopped' + + wait_for_group_synced "${primary_cluster}" "${pool}"/"${group0}" + compare_image_with_snapshot "${secondary_cluster}" "${pool}/${image_prefix}0" "${primary_cluster}" "${pool}/${image_prefix}0@${snap0}" + + # promote secondary (cluster1), demote original primary (cluster2) and request resync + stop_mirrors "${secondary_cluster}" '-9' + mirror_group_promote "${secondary_cluster}" "${pool}/${group0}" '--force' + start_mirrors "${secondary_cluster}" + mirror_group_demote "${primary_cluster}" "${pool}/${group0}" + + local group_id_before group_id_after + get_id_from_group_info ${primary_cluster} ${pool}/${group0} group_id_before + mirror_group_resync "${primary_cluster}" "${pool}/${group0}" + + wait_for_group_synced "${secondary_cluster}" "${pool}"/"${group0}" + + get_id_from_group_info ${primary_cluster} ${pool}/${group0} group_id_after + test "${group_id_before}" != "${group_id_after}" || fail "group was not recreated" + + compare_image_with_snapshot "${primary_cluster}" "${pool}/${image_prefix}0" "${primary_cluster}" "${pool}/${image_prefix}0@${snap0}" + + write_image "${secondary_cluster}" "${pool}" "${image_prefix}0" 10 4096 + compare_image_with_snapshot_expect_difference "${secondary_cluster}" "${pool}/${image_prefix}0" "${primary_cluster}" "${pool}/${image_prefix}0@${snap0}" + mirror_group_snapshot_and_wait_for_sync_complete "${primary_cluster}" "${secondary_cluster}" "${pool}"/"${group0}" + compare_images "${secondary_cluster}" "${primary_cluster}" "${pool}" "${pool}" "${image_prefix}0" + + write_image "${secondary_cluster}" "${pool}" "${image_prefix}0" 10 4096 + + # failback to original primary (cluster2) + local group_snap_id_a group_snap_id_b + get_newest_group_snapshot_id "${secondary_cluster}" "${pool}"/"${group0}" 
group_snap_id_a + get_newest_group_snapshot_id "${primary_cluster}" "${pool}"/"${group0}" group_snap_id_b + test "${group_snap_id_a}" = "${group_snap_id_b}" || fail "group not synced" + + # demote - neither site is primary + mirror_group_demote "${secondary_cluster}" "${pool}/${group0}" + wait_for_group_status_in_pool_dir "${secondary_cluster}" "${pool}"/"${group0}" 'up+unknown' + + # confirm that a new snapshot was taken by the demote operation + local group_snap_id_c + get_newest_group_snapshot_id "${secondary_cluster}" "${pool}"/"${group0}" group_snap_id_c + test "${group_snap_id_a}" != "${group_snap_id_c}" || fail "new snap not taken by demote" + + local group_id_before group_id_after + get_id_from_group_info ${secondary_cluster} ${pool}/${group0} group_id_before + local image_id_before image_id_after + get_image_id2 ${secondary_cluster} ${pool}/${image_prefix}0 image_id_before + + # request resync - won't happen until other site is marked as primary + mirror_group_resync "${secondary_cluster}" "${pool}/${group0}" + + get_id_from_group_info ${secondary_cluster} ${pool}/${group0} group_id_after + test "${group_id_before}" = "${group_id_after}" || fail "group recreated with no primary" + get_image_id2 ${secondary_cluster} ${pool}/${image_prefix}0 image_id_after + test "${image_id_before}" = "${image_id_after}" || fail "image recreated with no primary" + + if [ "${scenario}" = 'wait_before_promote' ]; then + # wait for the demote snapshot to be synced before promoting the other site + wait_for_group_synced "${secondary_cluster}" "${pool}"/"${group0}" + + local group_snap_id_e group_snap_id_f + get_newest_group_snapshot_id "${secondary_cluster}" "${pool}"/"${group0}" group_snap_id_e + get_newest_group_snapshot_id "${primary_cluster}" "${pool}"/"${group0}" group_snap_id_f + test "${group_snap_id_c}" = "${group_snap_id_e}" || fail "new snap on original secondary" + test "${group_snap_id_c}" = "${group_snap_id_f}" || fail "group not synced" + fi + + if [ "${scenario}" = 'retry_promote' ]; then + while true; do + { mirror_group_promote_try "${primary_cluster}" "${pool}/${group0}" && break; } || : + done + else + mirror_group_promote "${primary_cluster}" "${pool}/${group0}" + fi + + wait_for_group_status_in_pool_dir "${secondary_cluster}" "${pool}"/"${group0}" 'up+replaying' "${image_count}" + wait_for_group_status_in_pool_dir "${primary_cluster}" "${pool}"/"${group0}" 'up+stopped' + + # Write some data, take a regular mirror snapshot, wait for it to sync on secondary cluster + write_image "${primary_cluster}" "${pool}" "${image_prefix}0" 10 4096 + mirror_group_snapshot_and_wait_for_sync_complete "${secondary_cluster}" "${primary_cluster}" "${pool}"/"${group0}" + + # check that group and images were deleted and recreated on secondary cluster (as a result of the resync request) + get_id_from_group_info ${secondary_cluster} ${pool}/${group0} group_id_after + test "${group_id_before}" != "${group_id_after}" || fail "group not recreated by resync" + get_image_id2 ${secondary_cluster} ${pool}/${image_prefix}0 image_id_after + test "${image_id_before}" != "${image_id_after}" || fail "image not recreated by resync" + + group_remove "${primary_cluster}" "${pool}/${group0}" + wait_for_group_not_present "${primary_cluster}" "${pool}" "${group0}" + wait_for_group_not_present "${secondary_cluster}" "${pool}" "${group0}" + + images_remove "${primary_cluster}" "${pool}/${image_prefix}" "${image_count}" + stop_mirrors "${primary_cluster}" + check_daemon_running "${secondary_cluster}" +} + +# test ODF 
failover/failback sequence +declare -a test_resync_marker_1=("${CLUSTER2}" "${CLUSTER1}" "${pool0}" "${image_prefix}" 'no_change' 3) + +test_resync_marker_scenarios=1 + +# This test does the following: +# 1) setup a nothing-fancy group, write some data, let it sync from site-a to site-b +# 2) demote site-a +# 3) execute rbd mirror group resync command on site-b +# 4) assert that nothing happens at this point (on a bad build the group on site-b would be removed after some time, note that rbd-mirror daemons must be running for that to happen) +# 5) promote site-b +# 6) write some more data, let it sync from site-b to site-a +# 7) demote site-b +# 8) promote site-a +# 9) ensure that the group on site-b doesnt get resynced at this point +# 10) write some data on site-a and let it sync to site-b +# 11) check that site-b group id has not changed again - since just after step 5 +test_resync_marker() +{ + local primary_cluster=$1 ; shift + local secondary_cluster=$1 ; shift + local pool=$1 ; shift + local image_prefix=$1 ; shift + local scenario=$1 ; shift + local image_count=$(($1*"${image_multiplier}")) ; shift + + local snap0='snap_0' + local snap1='snap_1' + + # ODF has daemon running on both clusters always + start_mirrors "${primary_cluster}" + + group_create "${primary_cluster}" "${pool}/${group0}" + images_create "${primary_cluster}" "${pool}/${image_prefix}" "${image_count}" + write_image "${primary_cluster}" "${pool}" "${image_prefix}0" 10 4096 + group_images_add "${primary_cluster}" "${pool}/${group0}" "${pool}/${image_prefix}" "${image_count}" + + mirror_group_enable "${primary_cluster}" "${pool}/${group0}" + wait_for_group_present "${secondary_cluster}" "${pool}" "${group0}" "${image_count}" + wait_for_group_replay_started "${secondary_cluster}" "${pool}"/"${group0}" "${image_count}" + wait_for_group_status_in_pool_dir "${secondary_cluster}" "${pool}"/"${group0}" 'up+replaying' "${image_count}" + wait_for_group_status_in_pool_dir "${primary_cluster}" "${pool}"/"${group0}" 'up+stopped' + + wait_for_group_synced "${primary_cluster}" "${pool}"/"${group0}" + + local group_id_before group_id_after + get_id_from_group_info ${secondary_cluster} ${pool}/${group0} group_id_before + local image_id_before image_id_after + get_image_id2 ${secondary_cluster} ${pool}/${image_prefix}0 image_id_before + + # demote primary and request resync on secondary - check that group does not get deleted (due to resync request flag) + mirror_group_demote "${primary_cluster}" "${pool}/${group0}" + mirror_group_resync "${secondary_cluster}" "${pool}/${group0}" + wait_for_group_status_in_pool_dir "${secondary_cluster}" "${pool}"/"${group0}" 'up+stopped' + + get_id_from_group_info ${secondary_cluster} ${pool}/${group0} group_id_after + test "${group_id_before}" = "${group_id_after}" || fail "group recreated with no primary" + get_image_id2 ${secondary_cluster} ${pool}/${image_prefix}0 image_id_after + test "${image_id_before}" = "${image_id_after}" || fail "image recreated with no primary" + + # TODO next command fails (note that without the resync command above it succeeds) + # 2025-03-06T19:13:47.722+0000 7f655045cb40 -1 librbd::api::Mirror: group_promote: group test-group0 is still primary within a remote cluster + mirror_group_promote "${secondary_cluster}" "${pool}/${group0}" + + get_id_from_group_info ${secondary_cluster} ${pool}/${group0} group_id_after + test "${group_id_before}" = "${group_id_after}" || fail "group recreated" + get_image_id2 ${secondary_cluster} ${pool}/${image_prefix}0 image_id_after + 
test "${image_id_before}" = "${image_id_after}" || fail "image recreated" + + write_image "${secondary_cluster}" "${pool}" "${image_prefix}0" 10 4096 + mirror_group_snapshot_and_wait_for_sync_complete "${primary_cluster}" "${secondary_cluster}" "${pool}"/"${group0}" + + # demote - neither site is primary + mirror_group_demote "${secondary_cluster}" "${pool}/${group0}" + + # wait for the demote snapshot to be synced before promoting the other site + wait_for_group_synced "${secondary_cluster}" "${pool}"/"${group0}" + + # promote original primary again + mirror_group_promote "${primary_cluster}" "${pool}/${group0}" + + # confirm that group and image are not recreated - resync flag was cleared + get_id_from_group_info ${secondary_cluster} ${pool}/${group0} group_id_after + test "${group_id_before}" = "${group_id_after}" || fail "group recreated" + get_image_id2 ${secondary_cluster} ${pool}/${image_prefix}0 image_id_after + test "${image_id_before}" = "${image_id_after}" || fail "image recreated" + + # write some data, take a snapshot and wait for sync to complete + write_image "${primary_cluster}" "${pool}" "${image_prefix}0" 10 4096 + mirror_group_snapshot_and_wait_for_sync_complete "${secondary_cluster}" "${primary_cluster}" "${pool}"/"${group0}" + + # check that group and image ids still not changed on secondary + get_id_from_group_info ${secondary_cluster} ${pool}/${group0} group_id_after + test "${group_id_before}" = "${group_id_after}" || fail "group recreated" + get_image_id2 ${secondary_cluster} ${pool}/${image_prefix}0 image_id_after + test "${image_id_before}" = "${image_id_after}" || fail "image recreated" + + group_remove "${primary_cluster}" "${pool}/${group0}" + wait_for_group_not_present "${primary_cluster}" "${pool}" "${group0}" + wait_for_group_not_present "${secondary_cluster}" "${pool}" "${group0}" + + images_remove "${primary_cluster}" "${pool}/${image_prefix}" "${image_count}" + stop_mirrors "${primary_cluster}" + check_daemon_running "${secondary_cluster}" +} + # test resync scenarios declare -a test_resync_1=("${CLUSTER2}" "${CLUSTER1}" "${pool0}" "${image_prefix}" 'no_change' 3) @@ -2423,7 +2808,7 @@ test_resync() echo "id = ${primary_group_snap_id}" # stop the daemon to prevent further syncing of snapshots - stop_mirrors "${secondary_cluster}" + stop_mirrors "${secondary_cluster}" '-9' # promote secondary and change data on image mirror_group_promote "${secondary_cluster}" "${pool}/${group0}" '--force' @@ -2444,7 +2829,7 @@ test_resync() # Repeat the test this time changing the data on the primary too. # stop the daemon to prevent further syncing of snapshots - stop_mirrors "${secondary_cluster}" + stop_mirrors "${secondary_cluster}" '-9' # promote secondary and change data on image mirror_group_promote "${secondary_cluster}" "${pool}/${group0}" '--force' @@ -2469,7 +2854,7 @@ test_resync() # Repeat the test this time swapping the primary and secondary and resyncing back to the new secondary. 
# stop the daemon to prevent further syncing of snapshots - stop_mirrors "${secondary_cluster}" + stop_mirrors "${secondary_cluster}" '-9' # promote secondary mirror_group_promote "${secondary_cluster}" "${pool}/${group0}" '--force' @@ -2504,6 +2889,29 @@ test_resync() start_mirrors "${secondary_cluster}" } +check_for_no_keys() +{ + local primary_cluster=$1 + local secondary_cluster=$2 + local cluster pools pool key_count obj_count + + for cluster in ${primary_cluster} ${secondary_cluster}; do + local pools + pools=$(CEPH_ARGS='' ceph --cluster ${cluster} osd pool ls | grep -v "^\." | xargs) + + for pool in ${pools}; do + # see if the rbd_mirror_leader object exists in the pool + get_pool_obj_count "${cluster}" "${pool}" "rbd_mirror_leader" obj_count + + # if it does then check that there are no entries left in it + if [ $obj_count -gt 0 ]; then + count_omap_keys_with_filter "${cluster}" "${pool}" "rbd_mirror_leader" "image_map" key_count + test "${key_count}" = 0 || fail "last test left keys" + fi + done + done +} + run_test() { local test_name=$1 @@ -2511,6 +2919,37 @@ run_test() declare -n test_parameters="$test_name"_"$test_scenario" + local primary_cluster=cluster2 + local secondary_cluster=cluster1 + + # If the tmpdir and cluster conf file exist then reuse the existing cluster + # but stop the daemon on the primary if it was left running by the last test + # and check that there are no unexpected objects left + if [ -d "${RBD_MIRROR_TEMDIR}" ] && [ -f "${RBD_MIRROR_TEMDIR}"'/cluster1.conf' ] + then + export RBD_MIRROR_USE_EXISTING_CLUSTER=1 + + # need to call this before checking the current state + setup_tempdir + + # look at every pool on both clusters and check that there are no entries leftover in rbd_image_leader + check_for_no_keys "${primary_cluster}" "${secondary_cluster}" + + # if the "mirror" pool doesn't exist then call setup to recreate all the required pools + local pool_count + get_pool_count "${primary_cluster}" 'mirror' pool_count + if [ 0 = ${pool_count} ]; then + setup + fi + else + setup + fi + + # stop mirror daemon if it has been left running on the primary cluster + stop_mirrors "${primary_cluster}" + # restart mirror daemon if it has been stopped on the secondary cluster + start_mirrors "${secondary_cluster}" + testlog "TEST:$test_name scenario:$test_scenario parameters:" "${test_parameters[@]}" "$test_name" "${test_parameters[@]}" } @@ -2553,17 +2992,17 @@ run_all_tests() run_test_all_scenarios test_force_promote run_test_all_scenarios test_resync run_test_all_scenarios test_remote_namespace - run_test_all_scenarios test_multiple_user_snapshot_whilst_stopped + run_test_all_scenarios test_multiple_mirror_group_snapshot_whilst_stopped run_test_all_scenarios test_create_group_with_image_remove_then_repeat + run_test_all_scenarios test_enable_disable_repeat run_test_all_scenarios test_empty_group_omap_keys #run_test_all_scenarios test_group_with_clone_image - run_test_all_scenarios test_multiple_user_snapshot_time + run_test_all_scenarios test_multiple_mirror_group_snapshot_unlink_time run_test_all_scenarios test_force_promote_delete_group run_test_all_scenarios test_create_group_stop_daemon_then_recreate - #run_test_all_scenarios test_enable_mirroring_when_duplicate_group_exists - - #FIXME: This test leaves residual groups on secondary moving it to the end. 
- run_test_all_scenarios test_enable_disable_repeat + run_test_all_scenarios test_enable_mirroring_when_duplicate_group_exists + run_test_all_scenarios test_odf_failover_failback + #run_test_all_scenarios test_resync_marker } if [ -n "${RBD_MIRROR_SHOW_CLI_CMD}" ]; then @@ -2572,22 +3011,6 @@ else set -ex fi -# If the tmpdir and cluster conf file exist then reuse the existing cluster -if [ -d "${RBD_MIRROR_TEMDIR}" ] && [ -f "${RBD_MIRROR_TEMDIR}"'/cluster1.conf' ] -then - export RBD_MIRROR_USE_EXISTING_CLUSTER=1 -fi - -setup - -# see if we need to (re)start rbd-mirror deamon -pid=$(cat "$(daemon_pid_file "${CLUSTER1}")" 2>/dev/null) || : -if [ -z "${pid}" ] -then - start_mirrors "${CLUSTER1}" -fi -check_daemon_running "${CLUSTER1}" - # restore the arguments from the cli set -- "${args[@]}" diff --git a/qa/workunits/rbd/rbd_mirror_helpers.sh b/qa/workunits/rbd/rbd_mirror_helpers.sh index 211696a3c055f..7da7a72d68093 100755 --- a/qa/workunits/rbd/rbd_mirror_helpers.sh +++ b/qa/workunits/rbd/rbd_mirror_helpers.sh @@ -187,6 +187,8 @@ run_cmd_internal() { export CEPH_ARGS="--id ${CEPH_ID}" fi + echo "${cmd}" >> "${TEMPDIR}/rbd-mirror.cmd.log" + # Don't exit immediately if the command exits with a non-zero status. set +e $cmd >"${CMD_STDOUT}" 2>"${CMD_STDERR}" @@ -288,6 +290,11 @@ daemon_pid_file() echo $(ceph-conf --cluster $cluster --name "client.${MIRROR_USER_ID_PREFIX}${instance}" 'pid file') } +echo_red() +{ + echo -e "${RED}$@${NO_COLOUR}" +} + testlog() { echo -e "${RED}"$(date '+%F %T') $@ "${NO_COLOUR}"| tee -a "${TEMPDIR}/rbd-mirror.test.log" >&2 @@ -489,8 +496,8 @@ setup() if [ -z "${RBD_MIRROR_USE_EXISTING_CLUSTER}" ]; then setup_cluster "${CLUSTER1}" setup_cluster "${CLUSTER2}" + setup_dummy_objects "${CLUSTER1}" fi - setup_dummy_objects "${CLUSTER1}" setup_pools "${CLUSTER1}" "${CLUSTER2}" setup_pools "${CLUSTER2}" "${CLUSTER1}" @@ -957,7 +964,7 @@ get_newest_mirror_snapshot() local log=$4 rbd --cluster "${cluster}" snap list --all "${pool}/${image}" --format xml | \ - $XMLSTARLET sel -t -c "//snapshots/snapshot[namespace/complete='true' and position()=last()]" > \ + $XMLSTARLET sel -t -c "(//snapshots/snapshot[namespace/complete='true'])[last()]" > \ ${log} || true } @@ -1089,6 +1096,18 @@ test_fields_in_group_info() test "${fields_arr[5]}" = "${expected_is_primary}" || { fail "primary = ${fields_arr[5]}"; return 1; } } +get_id_from_group_info() +{ + local cluster=$1 ; shift + local group_spec=$1 ; shift + local -n _result=$1 ; shift + + local fields=(//group/group_id) + local fields_arr + get_fields_from_group_info "${cluster}" "${group_spec}" fields_arr "${fields[@]}" + _result="${fields_arr[0]}" +} + get_fields_from_mirror_image_status() { local cluster=$1 ; shift @@ -1472,7 +1491,7 @@ create_snapshot() local image=$3 local snap=$4 - rbd --cluster ${cluster} snap create ${pool}/${image}@${snap} + run_cmd "rbd --cluster ${cluster} snap create ${pool}/${image}@${snap}" } remove_snapshot() @@ -1916,6 +1935,16 @@ get_image_id() sed -ne 's/^.*block_name_prefix: rbd_data\.//p' } +get_image_id2() +{ + local cluster=$1 + local image_spec=$2 + local -n _id=$3 + + run_cmd "rbd --cluster ${cluster} info ${image_spec} --format xml --pretty-format" + _id=$($XMLSTARLET sel -t -v "//image/id" "$CMD_STDOUT") || { fail "no id!"; return; } +} + get_image_mirroring_global_id() { local cluster=$1 @@ -2192,7 +2221,12 @@ mirror_group_disable() local cluster=$1 ; shift local group_spec=$1 ; shift - run_cmd "rbd --cluster=${cluster} mirror group disable $* ${group_spec}" + local force + if [ -n "$1" 
]; then + force=$1; shift + fi + + run_cmd "rbd --cluster=${cluster} mirror group disable $* ${group_spec} ${force}" } create_group_and_enable_mirror() @@ -2218,8 +2252,16 @@ mirror_group_promote() local cluster=$1 local group_spec=$2 local force=$3 + local runner=${4:-"run_cmd"} + + "$runner" "rbd --cluster=${cluster} mirror group promote ${group_spec} ${force}" +} + +mirror_group_promote_try() +{ + local force=${3:-''} - run_cmd "rbd --cluster=${cluster} mirror group promote ${group_spec} ${force}" + mirror_group_promote "$@" "${force}" "try_cmd" } mirror_group_snapshot() @@ -2280,6 +2322,31 @@ get_group_snap_name() _group_snap_name="$($XMLSTARLET sel -t -v "//group_snaps/group_snap[id='${snap_id}']/snapshot" < "$CMD_STDOUT")" } +get_pool_count() +{ + local cluster=$1 + local pool_name=$2 + local -n _count=$3 + + run_cmd "ceph --cluster ${cluster} osd pool ls --format xml-pretty" + if [ "${pool_name}" = '*' ]; then + _count="$($XMLSTARLET sel -t -v "count(//pools/pool_name)" < "$CMD_STDOUT")" + else + _count="$($XMLSTARLET sel -t -v "count(//pools[pool_name='${pool_name}'])" < "$CMD_STDOUT")" + fi +} + +get_pool_obj_count() +{ + local cluster=$1 + local pool=$2 + local obj_name=$3 + local -n _count=$4 + + run_cmd "rados --cluster ${cluster} -p ${pool} ls --format xml-pretty" + _count="$($XMLSTARLET sel -t -v "count(//objects/object[name='${obj_name}'])" < "$CMD_STDOUT")" +} + get_image_snap_id_from_group_snap_info() { local cluster=$1 @@ -2633,13 +2700,12 @@ get_newest_group_snapshot_id() # TODO - have seen this next cmd fail with rc=2 and an empty list # this should not happen, but if it does then retry as a temp workaround try_cmd "rbd --cluster ${cluster} group snap list ${group_spec} --format xml --pretty-format" && - { _group_snap_id=$(xmlstarlet sel -t -v "//group_snaps/group_snap[state='complete' and position()=last()]/id" "$CMD_STDOUT" ); return; } - + { _group_snap_id=$(xmlstarlet sel -t -v "(//group_snaps/group_snap[state='complete']/id)[last()]" "$CMD_STDOUT" ); return; } for s in 0.1 1 2 4 8 8 8 8 8 8 8 8 16 16; do echo -e "${RED}RETRYING COMMAND${NO_COLOUR}"; sleep ${s} try_cmd "rbd --cluster ${cluster} group snap list ${group_spec} --format xml --pretty-format" && { - _group_snap_id=$(xmlstarlet sel -t -v "//group_snaps/group_snap[state='complete' and position()=last()]/id" "$CMD_STDOUT" ); return; } + _group_snap_id=$(xmlstarlet sel -t -v "(//group_snaps/group_snap[state='complete']/id)[last()]" "$CMD_STDOUT" ); return; } done fail "Failed to execute command" return 1 @@ -2767,52 +2833,39 @@ wait_for_group_status_in_pool_dir() return 1 } -# useful function that attempts to delete all groups and images -tidy() +stop_daemons_on_clusters() { - local primary_cluster=cluster2 - local secondary_cluster=cluster1 - local cluster pool group group_spec image image_spec + local cluster_list=$1 + local cluster - for cluster in ${primary_cluster} ${secondary_cluster}; do + for cluster in ${cluster_list}; do echo 'cluster:'${cluster} stop_mirrors ${cluster} '-9' done +} - for cluster in ${primary_cluster} ${secondary_cluster}; do +delete_pools_on_clusters() +{ + local cluster_list=$1 + local cluster + + for cluster in ${cluster_list}; do echo 'cluster:'${cluster} for pool in $(CEPH_ARGS='' ceph --cluster ${cluster} osd pool ls | grep -v "^\." 
| xargs); do echo 'pool:'${pool} run_admin_cmd "ceph --cluster ${cluster} osd pool delete ${pool} ${pool} --yes-i-really-really-mean-it" done done +} - # following is old method that used to remove individual object rather than removing entire pools - : ' - for cluster in ${primary_cluster} ${secondary_cluster}; do - echo 'cluster:'${cluster} - for pool in "${POOL}" "${PARENT_POOL}" "${POOL}/${NS1}" "${POOL}/${NS2}"; do - echo 'pool:'${pool} - for group in $(rbd --cluster ${cluster} group list ${pool} | xargs); do - group_spec=${pool}/${group} - echo 'group_spec:'${group_spec} - mirror_group_disable ${cluster} ${group_spec} '--force' - for image_spec in $(rbd --cluster ${cluster} group image list ${group_spec} | xargs); do - echo 'image_spec:'"${image_spec}" - mirror_image_disable ${cluster} ${image_spec} '--force' - group_image_remove ${cluster} ${group_spec} ${image_spec} - done - group_remove ${cluster} ${group_spec} - done - for image in $(rbd --cluster ${cluster} list ${pool} | xargs); do - image_spec=${pool}/${image} - echo 'image_spec:'"${image_spec}" - mirror_image_disable ${cluster} ${image_spec} '--force' - image_remove ${cluster} ${image_spec} - done - done - done - ' +# stops all daemons and deletes all pools (groups and images included) +tidy() +{ + local primary_cluster=cluster2 + local secondary_cluster=cluster1 + + stop_daemons_on_clusters "${primary_cluster} ${secondary_cluster}" + delete_pools_on_clusters "${primary_cluster} ${secondary_cluster}" } # list all groups, images and snaps -- 2.39.5