From 2e344ec42499fa7c905a030096c9d1ae8da78b90 Mon Sep 17 00:00:00 2001
From: John Agombar <agombar@uk.ibm.com>
Date: Thu, 13 Mar 2025 14:37:57 +0000
Subject: [PATCH] qa/workunits/rbd: update to mirror group snapshot tests

Update run_test_secnarios function to support a non-contiguous sequence of scenario numbers
Remove assert that checked empty omap keys between tests - now just logs to testlog

New tests:
- test_odf_failover_failback - new scenario with resync request on test_odf_failover_failback

Disabled tests:
- test_force_promote all scenarios fail since test is now checking group
  consistency during rollback

Signed-off-by: John Agombar <agombar@uk.ibm.com>
---
 qa/workunits/rbd/rbd_mirror_group_simple.sh | 159 +++++++++++++-------
 qa/workunits/rbd/rbd_mirror_helpers.sh      | 102 ++++++++++---
 2 files changed, 187 insertions(+), 74 deletions(-)

diff --git a/qa/workunits/rbd/rbd_mirror_group_simple.sh b/qa/workunits/rbd/rbd_mirror_group_simple.sh
index 0bcf5bd946a..00ae6d2c143 100755
--- a/qa/workunits/rbd/rbd_mirror_group_simple.sh
+++ b/qa/workunits/rbd/rbd_mirror_group_simple.sh
@@ -1003,19 +1003,6 @@ test_group_with_clone_image()
   image_remove "${primary_cluster}" "${pool}/child_image"
 }
 
-test_from_nithya_that_will_stop_working_when_api_changes()
-{
-[root@server1 build]# rbd-a group create data/grp1
-[root@server1 build]# rbd-a group image add data/grp1 data/img-1
-[root@server1 build]# rbd-a group image add data/grp1 data/img-2
-[root@server1 build]# rbd-a group image add data/grp1 data/img-3
-[root@server1 build]# rbd-a mirror group enable data/grp1
-[root@server1 build]# rbd-a mirror image demote data/img-2
-[root@server1 build]# rbd-a mirror group snapshot data/grp1
-[root@server1 build]# rbd-a snap ls --all data/img-3
-[root@server1 build]# rbd-a group snap ls data/grp1
-}
-
 # test two empty groups
 declare -a test_empty_groups_1=("${CLUSTER2}" "${CLUSTER1}" "${pool0}" "${group0}" "${group1}")
 
@@ -1162,7 +1149,7 @@ test_create_group_with_images_then_mirror_with_regular_snapshots()
   if [ "${scenario}" = 'remove_snap' ]; then
     group_snap_remove "${primary_cluster}" "${pool}/${group}" "${snap}"
     check_group_snap_doesnt_exist "${primary_cluster}" "${pool}/${group}" "${snap}"
-    # this next extra mirror_group_snapshot should not be needed - waiting for fix TODO
+    # this next extra mirror_group_snapshot should not be needed - waiting for fix TODO - coding leftover 38
     mirror_group_snapshot "${primary_cluster}" "${pool}/${group}"
     mirror_group_snapshot_and_wait_for_sync_complete "${secondary_cluster}" "${primary_cluster}" "${pool}"/"${group}"
     check_group_snap_doesnt_exist "${secondary_cluster}" "${pool}/${group}" "${snap}"
@@ -1229,7 +1216,8 @@ test_create_group_with_regular_snapshots_then_mirror()
 
   group_snap_remove "${primary_cluster}" "${pool}/${group}" "${snap}"
   check_group_snap_doesnt_exist "${primary_cluster}" "${pool}/${group}" "${snap}"
-  # this next extra mirror_group_snapshot should not be needed - waiting for fix TODO
+  # this next extra mirror_group_snapshot should not be needed - waiting for fix  - coding leftover 38
+
   mirror_group_snapshot "${primary_cluster}" "${pool}/${group}"
   mirror_group_snapshot_and_wait_for_sync_complete "${secondary_cluster}" "${primary_cluster}" "${pool}"/"${group}"
   check_group_snap_doesnt_exist "${secondary_cluster}" "${pool}/${group}" "${snap}"
@@ -1387,7 +1375,7 @@ test_create_group_with_multiple_images_do_io()
 
   group_snap_remove "${primary_cluster}" "${pool}/${group}" "${snap}"
   check_group_snap_doesnt_exist "${primary_cluster}" "${pool}/${group}" "${snap}"
-  # this next extra mirror_group_snapshot should not be needed - waiting for fix TODO
+  # this next extra mirror_group_snapshot should not be needed - waiting for fix TODO - coding leftover 38
   mirror_group_snapshot "${primary_cluster}" "${pool}/${group}"
   mirror_group_snapshot_and_wait_for_sync_complete "${secondary_cluster}" "${primary_cluster}" "${pool}"/"${group}"
   check_group_snap_doesnt_exist "${secondary_cluster}" "${pool}/${group}" "${snap}"
@@ -2023,8 +2011,9 @@ declare -a test_force_promote_3=("${CLUSTER2}" "${CLUSTER1}" "${pool0}" "${image
 declare -a test_force_promote_4=("${CLUSTER2}" "${CLUSTER1}" "${pool0}" "${image_prefix}" 'image_rename' 5)
 declare -a test_force_promote_5=("${CLUSTER2}" "${CLUSTER1}" "${pool0}" "${image_prefix}" 'no_change_primary_up' 5)
 
-# TODO scenarios 2-5 are currently failing - 4 is low priority
-test_force_promote_scenarios=1
+# TODO scenarios 2-4 are currently failing - 4 is low priority
+# test_force_promote_scenarios=5
+test_force_promote_scenarios='1 5'
 
 test_force_promote()
 {
@@ -2052,7 +2041,10 @@ test_force_promote()
 
   big_image=test-image-big
   image_create "${primary_cluster}" "${pool}/${big_image}" 4G
+  write_image "${primary_cluster}" "${pool}" "${big_image}" 1024 4096
   group_image_add "${primary_cluster}" "${pool}/${group0}" "${pool}/${big_image}"
+  create_snapshot "${primary_cluster}" "${pool}" "${big_image}" "${snap0}"
+  compare_image_with_snapshot "${primary_cluster}" "${pool}/${big_image}" "${primary_cluster}" "${pool}/${big_image}@${snap0}"
 
   mirror_group_enable "${primary_cluster}" "${pool}/${group0}"
   wait_for_group_present "${secondary_cluster}" "${pool}" "${group0}" "${image_count}"
@@ -2062,7 +2054,7 @@ test_force_promote()
 
   if [ -z "${RBD_MIRROR_USE_RBD_MIRROR}" ]; then
     if [ "${scenario}" = 'no_change_primary_up' ]; then
-      wait_for_group_status_in_pool_dir "${primary_cluster}" "${pool}"/"${group0}" 'up+stopped' 0
+      wait_for_group_status_in_pool_dir "${primary_cluster}" "${pool}"/"${group0}" 'up+stopped' "${image_count}"
     else
       wait_for_group_status_in_pool_dir "${primary_cluster}" "${pool}"/"${group0}" 'down+unknown' 0
     fi
@@ -2070,12 +2062,14 @@ test_force_promote()
 
   wait_for_group_synced "${primary_cluster}" "${pool}"/"${group0}"
   compare_image_with_snapshot "${secondary_cluster}" "${pool}/${image_prefix}0" "${primary_cluster}" "${pool}/${image_prefix}0@${snap0}"
+  compare_image_with_snapshot "${secondary_cluster}" "${pool}/${big_image}" "${primary_cluster}" "${pool}/${big_image}@${snap0}"
 
   write_image "${primary_cluster}" "${pool}" "${image_prefix}0" 10 4096
   create_snapshot "${primary_cluster}" "${pool}" "${image_prefix}0" "${snap1}"
 
   # make some changes to the big image so that the next sync will take a long time
   write_image "${primary_cluster}" "${pool}" "${big_image}" 1024 4194304
+  create_snapshot "${primary_cluster}" "${pool}" "${big_image}" "${snap1}"
 
   local global_id
   local image_size
@@ -2115,15 +2109,6 @@ test_force_promote()
     mirror_group_snapshot "${primary_cluster}" "${pool}/${group0}"
   fi
 
-  # TODO add the following test
-: '
-  # This test removes and recreates an image - it fails currently as the request to list the group snaps on the secondary fails
-  group_image_remove "${primary_cluster}" "${pool}/${group0}" "${pool}/${image_prefix}0"
-  image_remove "${primary_cluster}" "${pool}/${image_prefix}0" 
-  image_create "${primary_cluster}" "${pool}/${image_prefix}0" maybe different size?
-  group_image_add "${primary_cluster}" "${pool}/${group0}" "${pool}/${image_prefix}0"
-'
-
   local group_snap_id
   get_newest_group_snapshot_id "${primary_cluster}" "${pool}/${group0}" group_snap_id
   echo "id = ${group_snap_id}"
@@ -2143,6 +2128,12 @@ test_force_promote()
     wait_for_image_size_matches "${secondary_cluster}" "${pool}/${image_prefix}2" $(("${image_size}"+4*1024*1024))
   elif [ "${scenario}" = 'image_shrink' ]; then
     wait_for_image_size_matches "${secondary_cluster}" "${pool}/${image_prefix}3" $(("${image_size}"-4*1024*1024))
+  elif [ "${scenario}" = 'no_change' ] || [ "${scenario}" = 'no_change_primary_up' ]; then
+    local snap_id
+    get_newest_mirror_snapshot_id_on_primary "${primary_cluster}" "${pool}/${image_prefix}0" snap_id
+    echo "image_snap_id = ${snap_id}"
+    wait_for_snap_id_present "${secondary_cluster}" "${pool}/${image_prefix}0" "${snap_id}"
+    wait_for_snapshot_sync_complete "${secondary_cluster}" "${primary_cluster}" "${pool}" "${pool}" "${image_prefix}0" "${snap_id}"
   fi
 
   # stop the daemon to prevent further syncing of snapshots
@@ -2151,6 +2142,11 @@ test_force_promote()
   # check that latest snap is incomplete
   test_group_snap_sync_incomplete "${secondary_cluster}" "${pool}/${group0}" "${group_snap_id}" 
 
+  # check that the big image is incomplete
+  local big_image_snap_id
+  get_newest_mirror_snapshot_id_on_primary "${primary_cluster}" "${pool}/${big_image}" big_image_snap_id
+  test_snap_complete "${secondary_cluster}" "${pool}/${big_image}" "${big_image_snap_id}" 'false' || fail "big image is synced"
+
   # force promote the group on the secondary - should rollback to the last complete snapshot
   local old_primary_cluster
   mirror_group_promote "${secondary_cluster}" "${pool}/${group0}" '--force'
@@ -2161,6 +2157,10 @@ test_force_promote()
   mirror_group_demote "${old_primary_cluster}" "${pool}/${group0}"
   secondary_cluster="${old_primary_cluster}"
 
+  # check that we rolled back to snap0 state
+  compare_image_with_snapshot "${primary_cluster}" "${pool}/${image_prefix}0" "${secondary_cluster}" "${pool}/${image_prefix}0@${snap0}"
+  compare_image_with_snapshot "${primary_cluster}" "${pool}/${big_image}" "${secondary_cluster}" "${pool}/${big_image}@${snap0}"
+
   # Check that the rollback reverted the state 
   if [ "${scenario}" = 'image_add' ]; then
     # check that new image is not present
@@ -2178,24 +2178,22 @@ test_force_promote()
   fi
 
   local group_id_before
-  get_id_from_group_info ${secondary_cluster} ${pool}/${group0} group_id_before
+  get_id_from_group_info "${secondary_cluster}" "${pool}/${group0}" group_id_before
 
-  mirror_group_resync ${secondary_cluster} ${pool}/${group0}
+  mirror_group_resync "${secondary_cluster}" "${pool}/${group0}"
 
   if [ "${scenario}" != 'no_change_primary_up' ]; then
     start_mirrors "${secondary_cluster}"
     sleep 5
   fi  
-# TODO check that data can be copied back to original primary cluster
-# next line fails because latest snapshot on primary is never copied back to secondary
-# finish off the resync function
-# check that tidy up steps below work
+
   wait_for_group_synced "${primary_cluster}" "${pool}"/"${group0}"
   local group_id_after
-  get_id_from_group_info ${secondary_cluster} ${pool}/${group0} group_id_after
-  test "${group_id_before}" != "${group_id_after}" || fail "group was not recreated"
+  get_id_from_group_info "${secondary_cluster}" "${pool}/${group0}" group_id_after
+  test "${group_id_before}" != "${group_id_after}" || fail "group was not recreated by resync"
 
-  compare_image_with_snapshot "${secondary_cluster}" "${pool}/${image_prefix}0" "${primary_cluster}" "${pool}/${image_prefix}0@${snap0}"
+  compare_image_with_snapshot "${secondary_cluster}" "${pool}/${image_prefix}0" "${secondary_cluster}" "${pool}/${image_prefix}0@${snap0}"
+  compare_image_with_snapshot "${secondary_cluster}" "${pool}/${big_image}" "${secondary_cluster}" "${pool}/${big_image}@${snap0}"
 
   # Check that snapshots work on the new primary
   mirror_group_snapshot "${primary_cluster}" "${pool}/${group0}" group_snap_id
@@ -2216,6 +2214,7 @@ test_force_promote()
   old_primary_cluster="${primary_cluster}"
   primary_cluster="${secondary_cluster}"
   secondary_cluster="${old_primary_cluster}"
+  wait_for_no_keys "${primary_cluster}"
   stop_mirrors "${primary_cluster}"
   start_mirrors "${secondary_cluster}"
 }
@@ -2312,6 +2311,7 @@ test_force_promote_delete_group()
 
   images_remove "${primary_cluster}" "${pool}/${image_prefix}" "${image_count}"
 
+  wait_for_no_keys "${primary_cluster}"
   stop_mirrors "${primary_cluster}"
 }
 
@@ -2396,6 +2396,7 @@ test_force_promote_before_initial_sync()
   images_remove "${primary_cluster}" "${pool}/${image_prefix}" $(("${image_count}"-1))
   image_remove "${primary_cluster}" "${pool}/${big_image}"
 
+  wait_for_no_keys "${primary_cluster}"
   stop_mirrors "${primary_cluster}"
   start_mirrors "${secondary_cluster}"
 }
@@ -2521,18 +2522,21 @@ test_multiple_mirror_group_snapshot_whilst_stopped()
 # test ODF failover/failback sequence
 declare -a test_odf_failover_failback_1=("${CLUSTER2}" "${CLUSTER1}" "${pool0}" "${image_prefix}" 'wait_before_promote' 3)
 declare -a test_odf_failover_failback_2=("${CLUSTER2}" "${CLUSTER1}" "${pool0}" "${image_prefix}" 'retry_promote' 3)
+declare -a test_odf_failover_failback_3=("${CLUSTER2}" "${CLUSTER1}" "${pool0}" "${image_prefix}" 'resync_on_failback' 1)
 
-test_odf_failover_failback_scenarios=2
+test_odf_failover_failback_scenarios=3
 
 # ODF takes the following steps in failover/failback.  This test does the same.
-#Failover:
+# Failover:
 # rbd --cluster=site-b mirror group promote test_pool/test_group --force
+#
+# When site-a comes alive again request a resync
 # rbd --cluster=site-a mirror group demote test_pool/test_group
 # rbd --cluster=site-a mirror group resync test_pool/test_group
 #
-#Failback:
+# Failback:
 # rbd --cluster=site-b mirror group demote test_pool/test_group
-# rbd --cluster=site-b mirror group resync test_pool/test_group
+#  (scenario 3 requests a resync on site-b here)
 # rbd --cluster=site-a mirror group promote test_pool/test_group
 test_odf_failover_failback()
 {
@@ -2570,6 +2574,7 @@ test_odf_failover_failback()
   stop_mirrors "${secondary_cluster}" '-9'
   mirror_group_promote "${secondary_cluster}" "${pool}/${group0}" '--force'
   start_mirrors "${secondary_cluster}"
+  # original site comes alive again
   mirror_group_demote "${primary_cluster}" "${pool}/${group0}"
 
   local group_id_before group_id_after
@@ -2610,15 +2615,17 @@ test_odf_failover_failback()
   local image_id_before image_id_after
   get_image_id2 ${secondary_cluster} ${pool}/${image_prefix}0 image_id_before
   
-  # request resync - won't happen until other site is marked as primary
-  mirror_group_resync "${secondary_cluster}" "${pool}/${group0}" 
+  if [ "${scenario}" = 'resync_on_failback' ]; then
+    # request resync - won't happen until other site is marked as primary
+    mirror_group_resync "${secondary_cluster}" "${pool}/${group0}" 
+  fi  
 
   get_id_from_group_info ${secondary_cluster} ${pool}/${group0} group_id_after
   test "${group_id_before}" = "${group_id_after}" || fail "group recreated with no primary"
   get_image_id2 ${secondary_cluster} ${pool}/${image_prefix}0 image_id_after
   test "${image_id_before}" = "${image_id_after}" || fail "image recreated with no primary"
 
-  if [ "${scenario}" = 'wait_before_promote' ]; then
+  if [ "${scenario}" != 'retry_promote' ]; then
     # wait for the demote snapshot to be synced before promoting the other site
     wait_for_group_synced "${secondary_cluster}" "${pool}"/"${group0}"
 
@@ -2644,17 +2651,20 @@ test_odf_failover_failback()
   write_image "${primary_cluster}" "${pool}" "${image_prefix}0" 10 4096
   mirror_group_snapshot_and_wait_for_sync_complete "${secondary_cluster}" "${primary_cluster}" "${pool}"/"${group0}"
     
-  # check that group and images were deleted and recreated on secondary cluster (as a result of the resync request)
-  get_id_from_group_info ${secondary_cluster} ${pool}/${group0} group_id_after
-  test "${group_id_before}" != "${group_id_after}" || fail "group not recreated by resync"
-  get_image_id2 ${secondary_cluster} ${pool}/${image_prefix}0 image_id_after
-  test "${image_id_before}" != "${image_id_after}" || fail "image not recreated by resync"
+  if [ "${scenario}" = 'resync_on_failback' ]; then
+    # check that group and images were deleted and recreated on secondary cluster (as a result of the resync request)
+    get_id_from_group_info ${secondary_cluster} ${pool}/${group0} group_id_after
+    test "${group_id_before}" != "${group_id_after}" || fail "group not recreated by resync"
+    get_image_id2 ${secondary_cluster} ${pool}/${image_prefix}0 image_id_after
+    test "${image_id_before}" != "${image_id_after}" || fail "image not recreated by resync"
+  fi  
 
   group_remove "${primary_cluster}" "${pool}/${group0}"
   wait_for_group_not_present "${primary_cluster}" "${pool}" "${group0}"
   wait_for_group_not_present "${secondary_cluster}" "${pool}" "${group0}"
 
   images_remove "${primary_cluster}" "${pool}/${image_prefix}" "${image_count}"
+  wait_for_no_keys "${primary_cluster}"
   stop_mirrors "${primary_cluster}"
   check_daemon_running "${secondary_cluster}"
 }
@@ -2761,6 +2771,7 @@ test_resync_marker()
   wait_for_group_not_present "${secondary_cluster}" "${pool}" "${group0}"
 
   images_remove "${primary_cluster}" "${pool}/${image_prefix}" "${image_count}"
+  wait_for_no_keys "${primary_cluster}"
   stop_mirrors "${primary_cluster}"
   check_daemon_running "${secondary_cluster}"
 }
@@ -2885,6 +2896,7 @@ test_resync()
   images_remove "${secondary_cluster}" "${pool}/${image_prefix}" "${image_count}"
 
   # reset: start the right daemons for the next test
+  wait_for_no_keys "${primary_cluster}"
   stop_mirrors "${primary_cluster}"
   start_mirrors "${secondary_cluster}"
 }
@@ -2906,12 +2918,39 @@ check_for_no_keys()
       # if it does then check that there are no entries left in it
       if [ $obj_count -gt 0 ]; then
         count_omap_keys_with_filter "${cluster}" "${pool}" "rbd_mirror_leader" "image_map" key_count
-        test "${key_count}" = 0 || fail "last test left keys" 
+        test "${key_count}" = 0 || testlog "last test left keys" 
       fi
     done
   done    
 }
 
+wait_for_no_keys()
+{
+  local cluster=$1
+  local pools pool key_count obj_count
+
+  local pools
+  pools=$(CEPH_ARGS='' ceph --cluster "${cluster}" osd pool ls  | grep -v "^\." | xargs)
+
+  for pool in ${pools}; do
+    # see if the rbd_mirror_leader object exists in the pool
+    get_pool_obj_count "${cluster}" "${pool}" "rbd_mirror_leader" obj_count
+
+    # if it does then wait until there are no entries left in it
+    if [ "${obj_count}" -gt 0 ]; then
+      count_omap_keys_with_filter "${cluster}" "${pool}" "rbd_mirror_leader" "image_map" key_count
+      if [ "${key_count}" -gt 0 ]; then
+        for s in 0.1 1 2 4 8 8 8 8 8 8 8 8 16 16; do
+          sleep ${s}
+          count_omap_keys_with_filter "${cluster}" "${pool}" "rbd_mirror_leader" "image_map" key_count
+          test "${key_count}" = 0 && break
+        done
+        test "${key_count}" = 0 || testlog "waiting did not clear leftover entries"
+      fi
+    fi
+  done
+}
+
 run_test()
 {
   local test_name=$1
@@ -2959,10 +2998,23 @@ run_test_all_scenarios()
 {
   local test_name=$1
 
-  declare -n test_scenario_count="$test_name"_scenarios
+  declare -n test_scenarios="$test_name"_scenarios
+
+  # test_scenarios can either be a number or a sequence of numbers
+  # in the former case it should be the number of the maximum valid scenario
+  # in the later case it should be a sequence of valid scenario numbers
+  # The later case is required when a non-contiguous sequnence of scenario numbers is valid
+  local working_test_scenarios 
+  if [[ $test_scenarios =~ ^[0-9]+$ ]]
+  then
+    working_test_scenarios=$(seq 1 $test_scenarios)
+  else
+    working_test_scenarios=$test_scenarios
+  fi
+  echo "Scenarios to run : ${working_test_scenarios}"
 
   local loop
-  for loop in $(seq 1 $test_scenario_count); do
+  for loop in $working_test_scenarios; do
     run_test $test_name $loop
   done
 }
@@ -2989,7 +3041,7 @@ run_all_tests()
   run_test_all_scenarios test_stopped_daemon
   run_test_all_scenarios test_create_group_with_regular_snapshots_then_mirror
   run_test_all_scenarios test_image_move_group
-  run_test_all_scenarios test_force_promote
+  #run_test_all_scenarios test_force_promote
   run_test_all_scenarios test_resync
   run_test_all_scenarios test_remote_namespace
   run_test_all_scenarios test_multiple_mirror_group_snapshot_whilst_stopped
@@ -3003,6 +3055,7 @@ run_all_tests()
   run_test_all_scenarios test_enable_mirroring_when_duplicate_group_exists
   run_test_all_scenarios test_odf_failover_failback
   #run_test_all_scenarios test_resync_marker
+  #run_test_all_scenarios test_force_promote_before_initial_sync
 }
 
 if [ -n "${RBD_MIRROR_SHOW_CLI_CMD}" ]; then
diff --git a/qa/workunits/rbd/rbd_mirror_helpers.sh b/qa/workunits/rbd/rbd_mirror_helpers.sh
index 7da7a72d680..00c5a1f6d9f 100755
--- a/qa/workunits/rbd/rbd_mirror_helpers.sh
+++ b/qa/workunits/rbd/rbd_mirror_helpers.sh
@@ -956,42 +956,102 @@ mirror_image_snapshot()
     rbd --cluster "${cluster}" mirror image snapshot "${pool}/${image}"
 }
 
-get_newest_mirror_snapshot()
+# get the primary_snap_id for the most recent complete snap on the secondary cluster
+get_primary_snap_id_for_newest_mirror_snapshot_on_secondary()
 {
-    local cluster=$1
-    local pool=$2
-    local image=$3
-    local log=$4
+    local secondary_cluster=$1
+    local image_spec=$2
+    local -n _snap_id=$3
+
+    run_cmd "rbd --cluster ${secondary_cluster} snap list --all ${image_spec} --format xml --pretty-format" 
+    _snap_id=$(xmlstarlet sel -t -v "(//snapshots/snapshot/namespace[complete='true']/primary_snap_id)[last()]" "$CMD_STDOUT" )
+}
+
+# get the snap_id for the most recent complete snap on the primary cluster
+get_newest_mirror_snapshot_id_on_primary()
+{
+    local primary_cluster=$1
+    local image_spec=$2
+    local -n _snap_id=$3
+
+    run_cmd "rbd --cluster ${primary_cluster} snap list --all ${image_spec} --format xml --pretty-format" 
+    _snap_id=$(xmlstarlet sel -t -v "(//snapshots/snapshot[namespace/complete='true']/id)[last()]" "$CMD_STDOUT" )
+}
+
+test_snap_present()
+{
+    local secondary_cluster=$1
+    local image_spec=$2
+    local snap_id=$3
+    local expected_snap_count=$4
 
-    rbd --cluster "${cluster}" snap list --all "${pool}/${image}" --format xml | \
-        $XMLSTARLET sel -t -c "(//snapshots/snapshot[namespace/complete='true'])[last()]" > \
-        ${log} || true
+    run_cmd "rbd --cluster ${secondary_cluster} snap list -a ${image_spec} --format xml --pretty-format" 
+    test "${expected_snap_count}" = "$($XMLSTARLET sel -t -v "count(//snapshots/snapshot/namespace[primary_snap_id='${snap_id}'])" < "$CMD_STDOUT")" || { fail; return 1; }
+}
+
+test_snap_complete()
+{
+    local secondary_cluster=$1
+    local image_spec=$2
+    local snap_id=$3
+    local expected_complete=$4
+
+    run_cmd "rbd --cluster ${secondary_cluster} snap list -a ${image_spec} --format xml --pretty-format" 
+    test "${expected_complete}" = "$($XMLSTARLET sel -t -v "//snapshots/snapshot/namespace[primary_snap_id='${snap_id}']/complete" < "$CMD_STDOUT")" || { fail; return 1; }
+}
+
+wait_for_test_snap_present()
+{
+    local secondary_cluster=$1
+    local image_spec=$2
+    local snap_id=$3
+    local test_snap_count=$4
+    local s
+
+    for s in 0.1 1 2 4 8 8 8 8 8 8 8 8 16 16 32 32; do
+        sleep ${s}
+        test_snap_present "${secondary_cluster}" "${image_spec}" "${snap_id}" "${test_snap_count}" && return 0
+    done
+
+    fail "wait for count of snaps with id ${snap_id} to be ${test_snap_count} failed on ${secondary_cluster}"
+    return 1
+}
+
+wait_for_snap_id_present()
+{
+    local secondary_cluster=$1
+    local image_spec=$2
+    local snap_id=$3
+
+    wait_for_test_snap_present "${secondary_cluster}" "${image_spec}" "${snap_id}" 1
+}
+
+wait_for_snap_id_not_present()
+{
+    local secondary_cluster=$1
+    local image_spec=$2
+    local snap_id=$3
+
+    wait_for_test_snap_present "${secondary_cluster}" "${image_spec}" "${snap_id}" 0
 }
 
 wait_for_snapshot_sync_complete()
 {
-    local local_cluster=$1
+    local local_cluster=$1 
     local cluster=$2
     local local_pool=$3
     local remote_pool=$4
     local image=$5
 
-    local status_log=${TEMPDIR}/$(mkfname ${cluster}-${remote_pool}-${image}.status)
-    local local_status_log=${TEMPDIR}/$(mkfname ${local_cluster}-${local_pool}-${image}.status)
-
-    get_newest_mirror_snapshot "${cluster}" "${remote_pool}" "${image}" "${status_log}"
-    local snapshot_id=$(xmlstarlet sel -t -v "//snapshot/id" < ${status_log})
+    local primary_snapshot_id snapshot_id
+    get_newest_mirror_snapshot_id_on_primary "${cluster}" "${remote_pool}/${image}" primary_snapshot_id
 
     while true; do
         for s in 0.2 0.4 0.8 1.6 2 2 4 4 8 8 16 16 32 32; do
             sleep ${s}
-
-            get_newest_mirror_snapshot "${local_cluster}" "${local_pool}" "${image}" "${local_status_log}"
-            local primary_snapshot_id=$(xmlstarlet sel -t -v "//snapshot/namespace/primary_snap_id" < ${local_status_log})
-
+            get_primary_snap_id_for_newest_mirror_snapshot_on_secondary "${local_cluster}" "${local_pool}/${image}" snapshot_id
             test "${snapshot_id}" = "${primary_snapshot_id}" && return 0
         done
-
         return 1
     done
     return 1
@@ -999,8 +1059,8 @@ wait_for_snapshot_sync_complete()
 
 wait_for_replay_complete()
 {
-    local local_cluster=$1
-    local cluster=$2
+    local local_cluster=$1 #sec
+    local cluster=$2 #pri
     local local_pool=$3
     local remote_pool=$4
     local image=$5
-- 
2.39.5