From b4101c03a5ede2c30c6cbfbf1ec658411bc5c1aa Mon Sep 17 00:00:00 2001 From: John Agombar Date: Tue, 4 Mar 2025 14:24:43 +0000 Subject: [PATCH] qa/workunits/rbd: update to mirror group snapshot tests New tests: - force promote test with daemon running on both clusters - test_enable_mirroring_when_duplicate_group_exists - test_odf_failover_failback test - test_resync_marker test - test_force_promote_before_initial_sync test - scenarios in test_create_group_with_images_then_mirror_with_regular_snapshots Disabled tests: - test_force_promote scenarios 2 & 3 which repeatedly fail Renamed tests: - test_multiple_user_snapshot_time to test_multiple_mirror_group_snapshot_unlink_time - test_multiple_user_snapshot_whilst_stopped to test_multiple_mirror_group_snapshot_whilst_stopped Signed-off-by: John Agombar --- qa/workunits/rbd/rbd_mirror_group_simple.sh | 643 ++++++++++++++++---- qa/workunits/rbd/rbd_mirror_helpers.sh | 135 ++-- 2 files changed, 627 insertions(+), 151 deletions(-) diff --git a/qa/workunits/rbd/rbd_mirror_group_simple.sh b/qa/workunits/rbd/rbd_mirror_group_simple.sh index aada13a3494cf..0bcf5bd946a32 100755 --- a/qa/workunits/rbd/rbd_mirror_group_simple.sh +++ b/qa/workunits/rbd/rbd_mirror_group_simple.sh @@ -186,10 +186,22 @@ test_create_group_with_images_then_mirror() images_remove "${primary_cluster}" "${pool}/${image_prefix}" "${image_count}" } -declare -a test_enable_mirroring_when_duplicate_group_exists_1=("${CLUSTER2}" "${CLUSTER1}" "${pool0}" "${group0}" "${image_prefix}" 5) - -test_enable_mirroring_when_duplicate_group_exists_scenarios=1 - +declare -a test_enable_mirroring_when_duplicate_group_exists_1=("${CLUSTER2}" "${CLUSTER1}" "${pool0}" "${group0}" "${image_prefix}" 2 'remove') +declare -a test_enable_mirroring_when_duplicate_group_exists_2=("${CLUSTER2}" "${CLUSTER1}" "${pool0}" "${group0}" "${image_prefix}" 2 'rename_secondary') +declare -a test_enable_mirroring_when_duplicate_group_exists_3=("${CLUSTER2}" "${CLUSTER1}" "${pool0}" "${group0}" "${image_prefix}" 2 'rename_primary') + +test_enable_mirroring_when_duplicate_group_exists_scenarios=2 +# TODO scenario 3 fails at the moment + +# This test does the following +# 1. create a group with images on primary site +# 2. create a group with the same name with images with the same name on the secondary site +# 3. enable mirroring on the primary site +# 4. take different actions to allow mirroring to proceed +# scenario 1 - delete the duplicate named group and images on the secondary +# scenario 2 - rename the duplicate named group and images on the secondary +# scenario 3 - rename the duplicate named group and images on the primary +# 5. 
check that group and all images are successfully mirrored to secondary test_enable_mirroring_when_duplicate_group_exists() { local primary_cluster=$1 ; shift @@ -198,6 +210,7 @@ test_enable_mirroring_when_duplicate_group_exists() local group=$1 ; shift local image_prefix=$1 ; shift local image_count=$(($1*"${image_multiplier}")) ; shift + local scenario=$1 ; shift group_create "${primary_cluster}" "${pool}/${group}" images_create "${primary_cluster}" "${pool}/${image_prefix}" "${image_count}" @@ -209,23 +222,51 @@ test_enable_mirroring_when_duplicate_group_exists() mirror_group_enable "${primary_cluster}" "${pool}/${group}" + # group will be present on secondary, but won't be mirrored wait_for_group_present "${secondary_cluster}" "${pool}" "${group}" "${image_count}" - check_daemon_running "${secondary_cluster}" -exit 0 -# TODO finish this test - group should not be mirrored on secondary at this point. -# peer status when looking at status on primary should be "up+stopped" -# could rename group on secondary without renaming images -# or could delete group on secondary. -# or rename group on primary? - # ceph --daemon mirror group status groupName - wait_for_group_replay_started "${secondary_cluster}" "${pool}"/"${group}" "${image_count}" + wait_for_group_status_in_pool_dir "${primary_cluster}" "${pool}"/"${group}" 'down+unknown' + test_fields_in_group_info ${primary_cluster} ${pool}/${group} 'snapshot' 'enabled' 'true' check_daemon_running "${secondary_cluster}" - # rbd mirror group status groupName - #sleep 10 + if [ "${scenario}" = 'remove' ]; then + # remove the non-mirrored group on the secondary + group_remove "${secondary_cluster}" "${pool}/${group}" + elif [ "${scenario}" = 'rename_secondary' ]; then + group_rename "${secondary_cluster}" "${pool}/${group}" "${pool}/${group}_renamed" + elif [ "${scenario}" = 'rename_primary' ]; then + group_rename "${primary_cluster}" "${pool}/${group}" "${pool}/${group}_renamed" + group_orig="${group}" + group="${group}_renamed" + fi + + # group should now be mirrored, but images can't be + wait_for_group_status_in_pool_dir "${secondary_cluster}" "${pool}"/"${group}" 'up+error' + test_fields_in_group_info ${secondary_cluster} ${pool}/${group} 'snapshot' 'enabled' 'false' + + if [ "${scenario}" = 'remove' ]; then + # remove the non-mirrored images on the secondary + images_remove "${secondary_cluster}" "${pool}/${image_prefix}" "${image_count}" + elif [ "${scenario}" = 'rename_secondary' ]; then + local i + for i in $(seq 0 $((image_count-1))); do + image_rename "${secondary_cluster}" "${pool}/${image_prefix}${i}" "${pool}/${image_prefix}_renamed${i}" + done + elif [ "${scenario}" = 'rename_primary' ]; then + local i + for i in $(seq 0 $((image_count-1))); do + image_rename "${primary_cluster}" "${pool}/${image_prefix}${i}" "${pool}/${image_prefix}_renamed${i}" + done + image_prefix_orig="${image_prefix}" + image_prefix="${image_prefix}_renamed" + fi + + # TODO scenario 3 fails on the next line - no images are listed in the group wait_for_group_status_in_pool_dir "${secondary_cluster}" "${pool}"/"${group}" 'up+replaying' "${image_count}" wait_for_group_synced "${primary_cluster}" "${pool}/${group}" + wait_for_group_replay_started "${secondary_cluster}" "${pool}"/"${group}" "${image_count}" + check_daemon_running "${secondary_cluster}" + if [ -z "${RBD_MIRROR_USE_RBD_MIRROR}" ]; then wait_for_group_status_in_pool_dir "${primary_cluster}" "${pool}"/"${group}" 'down+unknown' 0 fi @@ -234,8 +275,15 @@ exit 0 wait_for_group_not_present 
"${primary_cluster}" "${pool}" "${group}" wait_for_group_not_present "${secondary_cluster}" "${pool}" "${group}" check_daemon_running "${secondary_cluster}" - images_remove "${primary_cluster}" "${pool}/${image_prefix}" "${image_count}" + + if [ "${scenario}" = 'rename_secondary' ]; then + group_remove "${secondary_cluster}" "${pool}/${group}_renamed" + images_remove "${secondary_cluster}" "${pool}/${image_prefix}_renamed" "${image_count}" + elif [ "${scenario}" = 'rename_primary' ]; then + group_remove "${secondary_cluster}" "${pool}${group_orig}" + images_remove "${secondary_cluster}" "${pool}/${image_prefix_orig}" "${image_count}" + fi } # record the time taken to enable and sync for a group with increasing number of images. @@ -330,7 +378,6 @@ declare -a test_create_group_stop_daemon_then_recreate_1=("${CLUSTER2}" "${CLUST declare -a test_create_group_stop_daemon_then_recreate_2=("${CLUSTER2}" "${CLUSTER1}" 'stop_restart_before_recreate') declare -a test_create_group_stop_daemon_then_recreate_3=("${CLUSTER2}" "${CLUSTER1}" 'stop_restart_after_recreate') -# TODO enable scenarios 2 and 3 when they pass test_create_group_stop_daemon_then_recreate_scenarios=3 test_create_group_stop_daemon_then_recreate() @@ -342,8 +389,6 @@ test_create_group_stop_daemon_then_recreate() local pool="${pool0}" local group="${group0}" - testlog "test_create_group_stop_daemon_then_recreate" - images_create "${primary_cluster}" "${pool}/${image_prefix}" 1 group_create "${primary_cluster}" "${pool}/${group}" @@ -385,7 +430,6 @@ test_create_group_stop_daemon_then_recreate() wait_for_group_replay_started "${secondary_cluster}" "${pool}"/"${group}" 1 wait_for_group_status_in_pool_dir "${secondary_cluster}" "${pool}"/"${group}" 'up+replaying' 1 - # remove and recreate group - expect 2 keys due to bug TODO group_remove "${primary_cluster}" "${pool}/${group}" group_create "${primary_cluster}" "${pool}/${group}" mirror_group_enable "${primary_cluster}" "${pool}/${group}" @@ -407,7 +451,6 @@ test_create_group_stop_daemon_then_recreate() echo "restarting daemon on secondary" start_mirrors "${secondary_cluster}" - # TODO fails on next step - group is not present on secondary (restarting the daemon again will cause it to appear) wait_for_group_present "${secondary_cluster}" "${pool}" "${group}" 0 wait_for_group_replay_started "${secondary_cluster}" "${pool}"/"${group}" 0 wait_for_group_status_in_pool_dir "${secondary_cluster}" "${pool}"/"${group}" 'up+replaying' 0 @@ -419,9 +462,13 @@ test_create_group_stop_daemon_then_recreate() mirror_group_disable "${primary_cluster}" "${pool}/${group}" wait_for_group_not_present "${secondary_cluster}" "${pool}" "${group}" -# Not sure what the expected count is here - have asked Nithya -# count_omap_keys_with_filter "${secondary_cluster}" "${pool}" "rbd_mirror_leader" "image_map" key_count -# test "${key_count}" = 2 || fail "unexpected key count:${key_count}" + # Wait for rbd_mirror_leader to be empty + for s in 0.1 1 2 4 8 8 8 8 8 8 8 8 16 16; do + sleep ${s} + count_omap_keys_with_filter "${secondary_cluster}" "${pool}" "rbd_mirror_leader" "image_map" key_count + test "${key_count}" = 0 && break + done + test "${key_count}" = 0 || fail "unexpected key count:${key_count}" check_daemon_running "${secondary_cluster}" @@ -1071,9 +1118,11 @@ test_images_different_pools() } # create regular group snapshots and test replay -declare -a test_create_group_with_images_then_mirror_with_regular_snapshots_1=("${CLUSTER2}" "${CLUSTER1}" "${pool0}" "${group0}" "${image_prefix}" 5) +declare -a 
test_create_group_with_images_then_mirror_with_regular_snapshots_1=("${CLUSTER2}" "${CLUSTER1}" "${pool0}" "${group0}" "${image_prefix}" 2 'remove_snap') +declare -a test_create_group_with_images_then_mirror_with_regular_snapshots_2=("${CLUSTER2}" "${CLUSTER1}" "${pool0}" "${group0}" "${image_prefix}" 2 'leave_snap') +declare -a test_create_group_with_images_then_mirror_with_regular_snapshots_3=("${CLUSTER2}" "${CLUSTER1}" "${pool0}" "${group0}" "${image_prefix}" 2 'force_disable') -test_create_group_with_images_then_mirror_with_regular_snapshots_scenarios=1 +test_create_group_with_images_then_mirror_with_regular_snapshots_scenarios=3 test_create_group_with_images_then_mirror_with_regular_snapshots() { @@ -1083,6 +1132,7 @@ test_create_group_with_images_then_mirror_with_regular_snapshots() local group=$1 ; shift local image_prefix=$1 ; shift local image_count=$(($1*"${image_multiplier}")) ; shift + local scenario=$1 ; shift local snap='regular_snap' @@ -1104,28 +1154,31 @@ test_create_group_with_images_then_mirror_with_regular_snapshots() group_snap_create "${primary_cluster}" "${pool}/${group}" "${snap}" check_group_snap_exists "${primary_cluster}" "${pool}/${group}" "${snap}" - # snap is currently copied to secondary cluster, where it remains in the "incomplete" state, but this is maybe incorrect - see slack thread TODO - # - should not be copied until mirrored. - mirror_group_snapshot_and_wait_for_sync_complete "${secondary_cluster}" "${primary_cluster}" "${pool}"/"${group}" - - check_group_snap_exists "${secondary_cluster}" "${pool}/${group}" "${snap}" + check_group_snap_doesnt_exist "${secondary_cluster}" "${pool}/${group}" "${snap}" - group_snap_remove "${primary_cluster}" "${pool}/${group}" "${snap}" - check_group_snap_doesnt_exist "${primary_cluster}" "${pool}/${group}" "${snap}" - # this next extra mirror_group_snapshot should not be needed - waiting for fix TODO - mirror_group_snapshot "${primary_cluster}" "${pool}/${group}" mirror_group_snapshot_and_wait_for_sync_complete "${secondary_cluster}" "${primary_cluster}" "${pool}"/"${group}" - check_group_snap_doesnt_exist "${secondary_cluster}" "${pool}/${group}" "${snap}" + check_group_snap_exists "${secondary_cluster}" "${pool}/${group}" "${snap}" - #TODO DEFECT - #exit 0 - # if I exit at this point and then - # - force disable mirroring for the group on the secondary - # - remove the group on the secondary - # we end up with snapshots that belong to the group being left lying around. 
- # see discussion in slack, might need defect + if [ "${scenario}" = 'remove_snap' ]; then + group_snap_remove "${primary_cluster}" "${pool}/${group}" "${snap}" + check_group_snap_doesnt_exist "${primary_cluster}" "${pool}/${group}" "${snap}" + # this next extra mirror_group_snapshot should not be needed - waiting for fix TODO + mirror_group_snapshot "${primary_cluster}" "${pool}/${group}" + mirror_group_snapshot_and_wait_for_sync_complete "${secondary_cluster}" "${primary_cluster}" "${pool}"/"${group}" + check_group_snap_doesnt_exist "${secondary_cluster}" "${pool}/${group}" "${snap}" + else + check_group_snap_exists "${primary_cluster}" "${pool}/${group}" "${snap}" + check_group_snap_exists "${secondary_cluster}" "${pool}/${group}" "${snap}" + fi - #TODO also try taking multiple regular group snapshots and check the behaviour there + if [ "${scenario}" = 'force_disable' ]; then + # Force disable mirroring on the secondary and check that everything can be cleaned up + mirror_group_disable "${secondary_cluster}" "${pool}/${group}" '--force' + group_remove "${secondary_cluster}" "${pool}/${group}" + wait_for_group_present "${primary_cluster}" "${pool}" "${group}" "${image_count}" + wait_for_group_not_present "${secondary_cluster}" "${pool}" "${group}" + images_remove "${secondary_cluster}" "${pool}/${image_prefix}" "${image_count}" + fi mirror_group_disable "${primary_cluster}" "${pool}/${group}" group_remove "${primary_cluster}" "${pool}/${group}" @@ -1411,7 +1464,6 @@ test_stopped_daemon() echo "starting daemon" start_mirrors "${secondary_cluster}" - # TODO often fails on next step - group is not present on secondary. wait_for_group_replay_started "${secondary_cluster}" "${pool}"/"${group}" $(("${group_image_count}"+1)) wait_for_group_status_in_pool_dir "${secondary_cluster}" "${pool}"/"${group}" 'up+replaying' $(("${group_image_count}"+1)) wait_for_group_synced "${primary_cluster}" "${pool}"/"${group}" @@ -1432,10 +1484,8 @@ test_stopped_daemon() fi else mirror_group_disable "${primary_cluster}" "${pool}/${group}" -# wait_for_group_not_present "${secondary_cluster}" "${pool}" "${group}" group_image_remove "${primary_cluster}" "${pool}/${group}" "${pool}/${image_name}" mirror_group_enable "${primary_cluster}" "${pool}/${group}" - # wait_for_group_present "${secondary_cluster}" "${pool}" "${group}" $(("${group_image_count}")) fi get_newest_group_snapshot_id "${primary_cluster}" "${pool}"/"${group}" primary_group_snap_id @@ -1446,8 +1496,6 @@ test_stopped_daemon() wait_for_group_replay_started "${secondary_cluster}" "${pool}"/"${group}" "${group_image_count}" wait_for_group_status_in_pool_dir "${secondary_cluster}" "${pool}"/"${group}" 'up+replaying' "${group_image_count}" - # TODO next command fails because rbd group snap list command fails with -2 - # though group does exist on secondary wait_for_group_synced "${primary_cluster}" "${pool}"/"${group}" get_newest_group_snapshot_id "${secondary_cluster}" "${pool}"/"${group}" secondary_group_snap_id @@ -1973,9 +2021,10 @@ declare -a test_force_promote_1=("${CLUSTER2}" "${CLUSTER1}" "${pool0}" "${image declare -a test_force_promote_2=("${CLUSTER2}" "${CLUSTER1}" "${pool0}" "${image_prefix}" 'image_expand' 5) declare -a test_force_promote_3=("${CLUSTER2}" "${CLUSTER1}" "${pool0}" "${image_prefix}" 'image_shrink' 5) declare -a test_force_promote_4=("${CLUSTER2}" "${CLUSTER1}" "${pool0}" "${image_prefix}" 'image_rename' 5) +declare -a test_force_promote_5=("${CLUSTER2}" "${CLUSTER1}" "${pool0}" "${image_prefix}" 'no_change_primary_up' 5) -# TODO 
scenarios 2, 3 and 4 are currently failing -test_force_promote_scenarios=3 +# TODO scenarios 2-5 are currently failing - 4 is low priority +test_force_promote_scenarios=1 test_force_promote() { @@ -1990,6 +2039,10 @@ test_force_promote() local snap0='snap_0' local snap1='snap_1' + if [ "${scenario}" = 'no_change_primary_up' ]; then + start_mirrors "${primary_cluster}" + fi + group_create "${primary_cluster}" "${pool}/${group0}" images_create "${primary_cluster}" "${pool}/${image_prefix}" $(("${image_count}"-1)) write_image "${primary_cluster}" "${pool}" "${image_prefix}0" 10 4096 @@ -2008,7 +2061,11 @@ test_force_promote() wait_for_group_status_in_pool_dir "${secondary_cluster}" "${pool}"/"${group0}" 'up+replaying' "${image_count}" if [ -z "${RBD_MIRROR_USE_RBD_MIRROR}" ]; then - wait_for_group_status_in_pool_dir "${primary_cluster}" "${pool}"/"${group0}" 'down+unknown' 0 + if [ "${scenario}" = 'no_change_primary_up' ]; then + wait_for_group_status_in_pool_dir "${primary_cluster}" "${pool}"/"${group0}" 'up+stopped' 0 + else + wait_for_group_status_in_pool_dir "${primary_cluster}" "${pool}"/"${group0}" 'down+unknown' 0 + fi fi wait_for_group_synced "${primary_cluster}" "${pool}"/"${group0}" @@ -2054,7 +2111,7 @@ test_force_promote() test_image_size_matches "${primary_cluster}" "${pool}/${image_prefix}3" $(("${image_size}"-4*1024*1024)) test_image_size_matches "${secondary_cluster}" "${pool}/${image_prefix}3" "${image_size}" mirror_group_snapshot "${primary_cluster}" "${pool}/${group0}" - elif [ "${scenario}" = 'no_change' ]; then + elif [ "${scenario}" = 'no_change' ] || [ "${scenario}" = 'no_change_primary_up' ]; then mirror_group_snapshot "${primary_cluster}" "${pool}/${group0}" fi @@ -2089,36 +2146,21 @@ test_force_promote() fi # stop the daemon to prevent further syncing of snapshots - stop_mirrors "${secondary_cluster}" + stop_mirrors "${secondary_cluster}" '-9' # check that latest snap is incomplete - ## this fails in the delete case as follows: - ##CEPH_ARGS='--id mirror' rbd --cluster cluster1 group snap list mirror/group_0 - ##ERR: rc= 2 test_group_snap_sync_incomplete "${secondary_cluster}" "${pool}/${group0}" "${group_snap_id}" - # TODO remove - just capturing debug info - try_cmd "rbd --cluster ${primary_cluster} group snap list ${pool}/${group0}" || : - try_cmd "rbd --cluster ${secondary_cluster} group snap list ${pool}/${group0}" || : - # force promote the group on the secondary - should rollback to the last complete snapshot local old_primary_cluster mirror_group_promote "${secondary_cluster}" "${pool}/${group0}" '--force' - # TODO remove - just capturing debug info - try_cmd "rbd --cluster ${primary_cluster} group snap list ${pool}/${group0}" || : - try_cmd "rbd --cluster ${secondary_cluster} group snap list ${pool}/${group0}" || : - old_primary_cluster="${primary_cluster}" primary_cluster="${secondary_cluster}" mirror_group_demote "${old_primary_cluster}" "${pool}/${group0}" secondary_cluster="${old_primary_cluster}" - # TODO remove - just capturing debug info - try_cmd "rbd --cluster ${secondary_cluster} group snap list ${pool}/${group0}" || : - try_cmd "rbd --cluster ${primary_cluster} group snap list ${pool}/${group0}" || : - # Check that the rollback reverted the state if [ "${scenario}" = 'image_add' ]; then # check that new image is not present @@ -2135,15 +2177,23 @@ test_force_promote() test_image_size_matches "${primary_cluster}" "${pool}/${image_prefix}3" "${image_size}" || fail "size mismatch" fi + local group_id_before + get_id_from_group_info 
${secondary_cluster} ${pool}/${group0} group_id_before + mirror_group_resync ${secondary_cluster} ${pool}/${group0} - start_mirrors "${secondary_cluster}" - sleep 5 + if [ "${scenario}" != 'no_change_primary_up' ]; then + start_mirrors "${secondary_cluster}" + sleep 5 + fi # TODO check that data can be copied back to original primary cluster # next line fails because latest snapshot on primary is never copied back to secondary # finish off the resync function # check that tidy up steps below work wait_for_group_synced "${primary_cluster}" "${pool}"/"${group0}" + local group_id_after + get_id_from_group_info ${secondary_cluster} ${pool}/${group0} group_id_after + test "${group_id_before}" != "${group_id_after}" || fail "group was not recreated" compare_image_with_snapshot "${secondary_cluster}" "${pool}/${image_prefix}0" "${primary_cluster}" "${pool}/${image_prefix}0@${snap0}" @@ -2162,7 +2212,7 @@ test_force_promote() images_remove "${primary_cluster}" "${pool}/${image_prefix}" $(("${image_count}"-1)) image_remove "${primary_cluster}" "${pool}/${big_image}" - # Note: we altered primary and secondary cluster, so reset. + # Note: we altered primary and secondary cluster, so reset and restart daemon old_primary_cluster="${primary_cluster}" primary_cluster="${secondary_cluster}" secondary_cluster="${old_primary_cluster}" @@ -2204,8 +2254,10 @@ test_force_promote_delete_group() wait_for_group_synced "${primary_cluster}" "${pool}"/"${group0}" # force promote the group on the secondary - # TODO disable mirror daemon here - see slack thread https://ibm-systems-storage.slack.com/archives/C07J9Q2E268/p1739856204809159 + # disable mirror daemon here - see slack thread https://ibm-systems-storage.slack.com/archives/C07J9Q2E268/p1739856204809159 + stop_mirrors "${secondary_cluster}" '-9' mirror_group_promote "${secondary_cluster}" "${pool}/${group0}" '--force' + start_mirrors "${secondary_cluster}" wait_for_group_replay_stopped ${secondary_cluster} ${pool}/${group0} wait_for_group_replay_stopped ${primary_cluster} ${pool}/${group0} wait_for_group_status_in_pool_dir ${secondary_cluster} ${pool}/${group0} 'up+stopped' "${image_count}" @@ -2236,7 +2288,6 @@ test_force_promote_delete_group() wait_for_group_status_in_pool_dir ${primary_cluster} ${pool}/${group0} 'up+stopped' "${image_count}" wait_for_group_status_in_pool_dir ${secondary_cluster} ${pool}/${group0} 'up+stopped' $(("${image_count}"-1)) - # TODO - test normally fails on next line with missing images wait_for_group_present "${secondary_cluster}" "${pool}" "${group0}" $(("${image_count}"-1)) wait_for_group_present "${primary_cluster}" "${pool}" "${group0}" "${image_count}" @@ -2264,12 +2315,97 @@ test_force_promote_delete_group() stop_mirrors "${primary_cluster}" } +declare -a test_force_promote_before_initial_sync_1=("${CLUSTER2}" "${CLUSTER1}" "${pool0}" "${image_prefix}" 5) + +test_force_promote_before_initial_sync_scenarios=1 + +test_force_promote_before_initial_sync() +{ + local primary_cluster=$1 ; shift + local secondary_cluster=$1 ; shift + local pool=$1 ; shift + local image_prefix=$1 ; shift + local image_count=$(($1*"${image_multiplier}")) ; shift + + local group0=test-group0 + + group_create "${primary_cluster}" "${pool}/${group0}" + images_create "${primary_cluster}" "${pool}/${image_prefix}" $(("${image_count}"-1)) + write_image "${primary_cluster}" "${pool}" "${image_prefix}0" 10 4096 + group_images_add "${primary_cluster}" "${pool}/${group0}" "${pool}/${image_prefix}" $(("${image_count}"-1)) + + big_image=test-image-big + 
image_create "${primary_cluster}" "${pool}/${big_image}" 4G + # make some changes to the big image so that the sync will take a long time + write_image "${primary_cluster}" "${pool}" "${big_image}" 1024 4194304 + group_image_add "${primary_cluster}" "${pool}/${group0}" "${pool}/${big_image}" + + mirror_group_enable "${primary_cluster}" "${pool}/${group0}" + wait_for_group_present "${secondary_cluster}" "${pool}" "${group0}" "${image_count}" + + wait_for_group_replay_started "${secondary_cluster}" "${pool}"/"${group0}" "${image_count}" + wait_for_group_status_in_pool_dir "${secondary_cluster}" "${pool}"/"${group0}" 'up+replaying' "${image_count}" + + if [ -z "${RBD_MIRROR_USE_RBD_MIRROR}" ]; then + wait_for_group_status_in_pool_dir "${primary_cluster}" "${pool}"/"${group0}" 'down+unknown' 0 + fi + + local group_snap_id + get_newest_group_snapshot_id "${primary_cluster}" "${pool}/${group0}" group_snap_id + wait_for_test_group_snap_present "${secondary_cluster}" "${pool}/${group0}" "${group_snap_id}" 1 + + # stop the daemon to prevent further syncing of snapshots + stop_mirrors "${secondary_cluster}" '-9' + + # check that latest snap is incomplete + test_group_snap_sync_incomplete "${secondary_cluster}" "${pool}/${group0}" "${group_snap_id}" + + # force promote the group on the secondary - TODO not sure if this should fail or not + # see https://ibm-systems-storage.slack.com/archives/C07J9Q2E268/p1741107842904719?thread_ts=1740716823.395479&cid=C07J9Q2E268 + mirror_group_promote "${secondary_cluster}" "${pool}/${group0}" '--force' + + # demote and try to resync again + mirror_group_demote "${secondary_cluster}" "${pool}/${group0}" + + mirror_group_resync ${secondary_cluster} ${pool}/${group0} + start_mirrors "${secondary_cluster}" + sleep 5 + + wait_for_group_synced "${primary_cluster}" "${pool}"/"${group0}" + + # try another force promote - this time it should work + stop_mirrors "${secondary_cluster}" '-9' + mirror_group_promote "${secondary_cluster}" "${pool}/${group0}" '--force' + + # demote and try to resync again + mirror_group_demote "${secondary_cluster}" "${pool}/${group0}" + + mirror_group_resync ${secondary_cluster} ${pool}/${group0} + start_mirrors "${secondary_cluster}" + sleep 5 + + wait_for_group_synced "${primary_cluster}" "${pool}"/"${group0}" + + # tidy up + mirror_group_disable "${primary_cluster}" "${pool}/${group0}" + group_remove "${primary_cluster}" "${pool}/${group0}" + + wait_for_group_not_present "${primary_cluster}" "${pool}" "${group0}" + wait_for_group_not_present "${secondary_cluster}" "${pool}" "${group0}" + + images_remove "${primary_cluster}" "${pool}/${image_prefix}" $(("${image_count}"-1)) + image_remove "${primary_cluster}" "${pool}/${big_image}" + + stop_mirrors "${primary_cluster}" + start_mirrors "${secondary_cluster}" +} + # test force unlink time -declare -a test_multiple_user_snapshot_time_1=("${CLUSTER2}" "${CLUSTER1}" "${pool0}") +declare -a test_multiple_mirror_group_snapshot_unlink_time_1=("${CLUSTER2}" "${CLUSTER1}" "${pool0}") -test_multiple_user_snapshot_time_scenarios=1 +test_multiple_mirror_group_snapshot_unlink_time_scenarios=1 -test_multiple_user_snapshot_time() +test_multiple_mirror_group_snapshot_unlink_time() { local primary_cluster=$1 local secondary_cluster=$2 @@ -2281,7 +2417,7 @@ test_multiple_user_snapshot_time() local time for image_count in "${image_counts[@]}"; do - test_multiple_user_snapshot_whilst_stopped "${primary_cluster}" "${secondary_cluster}" "${pool}" "${image_count}" time + 
test_multiple_mirror_group_snapshot_whilst_stopped "${primary_cluster}" "${secondary_cluster}" "${pool}" "${image_count}" time results+=(${time}) done @@ -2295,11 +2431,11 @@ test_multiple_user_snapshot_time() } # test force promote scenarios -declare -a test_multiple_user_snapshot_whilst_stopped_1=("${CLUSTER2}" "${CLUSTER1}" "${pool0}" 5) +declare -a test_multiple_mirror_group_snapshot_whilst_stopped_1=("${CLUSTER2}" "${CLUSTER1}" "${pool0}" 5) -test_multiple_user_snapshot_whilst_stopped_scenarios=1 +test_multiple_mirror_group_snapshot_whilst_stopped_scenarios=1 -test_multiple_user_snapshot_whilst_stopped() +test_multiple_mirror_group_snapshot_whilst_stopped() { local primary_cluster=$1 ; shift local secondary_cluster=$1 ; shift @@ -2323,11 +2459,14 @@ test_multiple_user_snapshot_whilst_stopped() wait_for_group_replay_started "${secondary_cluster}" "${pool}"/"${group0}" "${image_count}" wait_for_group_status_in_pool_dir "${secondary_cluster}" "${pool}"/"${group0}" 'up+replaying' "${image_count}" + wait_for_group_synced "${primary_cluster}" "${pool}"/"${group0}" + echo "stopping daemon on secondary" stop_mirrors "${secondary_cluster}" - # TODO starting the daemon on the primary seem to cause a problem with image deletion - Nithya investigating (see slack thread) - #echo "starting daemon on primary" - #start_mirrors "${primary_cluster}" + + local count + get_group_snap_count "${secondary_cluster}" "${pool}"/"${group0}" '*' count + test "${count}" = 1 || { fail "snap count = ${count}"; return 1; } local start_time end_time local times_result_arr=() @@ -2352,7 +2491,6 @@ test_multiple_user_snapshot_whilst_stopped() _average_snapshot_time=$((total/cnt)) fi - local count get_group_snap_count "${primary_cluster}" "${pool}"/"${group0}" '*' count test "${count}" -gt 3 || { fail "snap count = ${count}"; return 1; } @@ -2380,6 +2518,253 @@ test_multiple_user_snapshot_whilst_stopped() images_remove "${primary_cluster}" "${pool}/${image_prefix}" "${image_count}" } +# test ODF failover/failback sequence +declare -a test_odf_failover_failback_1=("${CLUSTER2}" "${CLUSTER1}" "${pool0}" "${image_prefix}" 'wait_before_promote' 3) +declare -a test_odf_failover_failback_2=("${CLUSTER2}" "${CLUSTER1}" "${pool0}" "${image_prefix}" 'retry_promote' 3) + +test_odf_failover_failback_scenarios=2 + +# ODF takes the following steps in failover/failback. This test does the same. 
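+#
+# Two scenarios are run: 'wait_before_promote' waits for the demote snapshot to be
+# synced before promoting site-a again, while 'retry_promote' promotes site-a
+# immediately and retries until the promote succeeds.
+#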
+#Failover: +# rbd --cluster=site-b mirror group promote test_pool/test_group --force +# rbd --cluster=site-a mirror group demote test_pool/test_group +# rbd --cluster=site-a mirror group resync test_pool/test_group +# +#Failback: +# rbd --cluster=site-b mirror group demote test_pool/test_group +# rbd --cluster=site-b mirror group resync test_pool/test_group +# rbd --cluster=site-a mirror group promote test_pool/test_group +test_odf_failover_failback() +{ + local primary_cluster=$1 ; shift + local secondary_cluster=$1 ; shift + local pool=$1 ; shift + local image_prefix=$1 ; shift + local scenario=$1 ; shift + local image_count=$(($1*"${image_multiplier}")) ; shift + + local snap0='snap_0' + local snap1='snap_1' + + # ODF has daemon running on both clusters always + start_mirrors "${primary_cluster}" + + group_create "${primary_cluster}" "${pool}/${group0}" + images_create "${primary_cluster}" "${pool}/${image_prefix}" "${image_count}" + write_image "${primary_cluster}" "${pool}" "${image_prefix}0" 10 4096 + group_images_add "${primary_cluster}" "${pool}/${group0}" "${pool}/${image_prefix}" "${image_count}" + + create_snapshot "${primary_cluster}" "${pool}" "${image_prefix}0" "${snap0}" + compare_image_with_snapshot "${primary_cluster}" "${pool}/${image_prefix}0" "${primary_cluster}" "${pool}/${image_prefix}0@${snap0}" + + mirror_group_enable "${primary_cluster}" "${pool}/${group0}" + wait_for_group_present "${secondary_cluster}" "${pool}" "${group0}" "${image_count}" + wait_for_group_replay_started "${secondary_cluster}" "${pool}"/"${group0}" "${image_count}" + wait_for_group_status_in_pool_dir "${secondary_cluster}" "${pool}"/"${group0}" 'up+replaying' "${image_count}" + wait_for_group_status_in_pool_dir "${primary_cluster}" "${pool}"/"${group0}" 'up+stopped' + + wait_for_group_synced "${primary_cluster}" "${pool}"/"${group0}" + compare_image_with_snapshot "${secondary_cluster}" "${pool}/${image_prefix}0" "${primary_cluster}" "${pool}/${image_prefix}0@${snap0}" + + # promote secondary (cluster1), demote original primary (cluster2) and request resync + stop_mirrors "${secondary_cluster}" '-9' + mirror_group_promote "${secondary_cluster}" "${pool}/${group0}" '--force' + start_mirrors "${secondary_cluster}" + mirror_group_demote "${primary_cluster}" "${pool}/${group0}" + + local group_id_before group_id_after + get_id_from_group_info ${primary_cluster} ${pool}/${group0} group_id_before + mirror_group_resync "${primary_cluster}" "${pool}/${group0}" + + wait_for_group_synced "${secondary_cluster}" "${pool}"/"${group0}" + + get_id_from_group_info ${primary_cluster} ${pool}/${group0} group_id_after + test "${group_id_before}" != "${group_id_after}" || fail "group was not recreated" + + compare_image_with_snapshot "${primary_cluster}" "${pool}/${image_prefix}0" "${primary_cluster}" "${pool}/${image_prefix}0@${snap0}" + + write_image "${secondary_cluster}" "${pool}" "${image_prefix}0" 10 4096 + compare_image_with_snapshot_expect_difference "${secondary_cluster}" "${pool}/${image_prefix}0" "${primary_cluster}" "${pool}/${image_prefix}0@${snap0}" + mirror_group_snapshot_and_wait_for_sync_complete "${primary_cluster}" "${secondary_cluster}" "${pool}"/"${group0}" + compare_images "${secondary_cluster}" "${primary_cluster}" "${pool}" "${pool}" "${image_prefix}0" + + write_image "${secondary_cluster}" "${pool}" "${image_prefix}0" 10 4096 + + # failback to original primary (cluster2) + local group_snap_id_a group_snap_id_b + get_newest_group_snapshot_id "${secondary_cluster}" "${pool}"/"${group0}" 
group_snap_id_a + get_newest_group_snapshot_id "${primary_cluster}" "${pool}"/"${group0}" group_snap_id_b + test "${group_snap_id_a}" = "${group_snap_id_b}" || fail "group not synced" + + # demote - neither site is primary + mirror_group_demote "${secondary_cluster}" "${pool}/${group0}" + wait_for_group_status_in_pool_dir "${secondary_cluster}" "${pool}"/"${group0}" 'up+unknown' + + # confirm that a new snapshot was taken by the demote operation + local group_snap_id_c + get_newest_group_snapshot_id "${secondary_cluster}" "${pool}"/"${group0}" group_snap_id_c + test "${group_snap_id_a}" != "${group_snap_id_c}" || fail "new snap not taken by demote" + + local group_id_before group_id_after + get_id_from_group_info ${secondary_cluster} ${pool}/${group0} group_id_before + local image_id_before image_id_after + get_image_id2 ${secondary_cluster} ${pool}/${image_prefix}0 image_id_before + + # request resync - won't happen until other site is marked as primary + mirror_group_resync "${secondary_cluster}" "${pool}/${group0}" + + get_id_from_group_info ${secondary_cluster} ${pool}/${group0} group_id_after + test "${group_id_before}" = "${group_id_after}" || fail "group recreated with no primary" + get_image_id2 ${secondary_cluster} ${pool}/${image_prefix}0 image_id_after + test "${image_id_before}" = "${image_id_after}" || fail "image recreated with no primary" + + if [ "${scenario}" = 'wait_before_promote' ]; then + # wait for the demote snapshot to be synced before promoting the other site + wait_for_group_synced "${secondary_cluster}" "${pool}"/"${group0}" + + local group_snap_id_e group_snap_id_f + get_newest_group_snapshot_id "${secondary_cluster}" "${pool}"/"${group0}" group_snap_id_e + get_newest_group_snapshot_id "${primary_cluster}" "${pool}"/"${group0}" group_snap_id_f + test "${group_snap_id_c}" = "${group_snap_id_e}" || fail "new snap on original secondary" + test "${group_snap_id_c}" = "${group_snap_id_f}" || fail "group not synced" + fi + + if [ "${scenario}" = 'retry_promote' ]; then + while true; do + { mirror_group_promote_try "${primary_cluster}" "${pool}/${group0}" && break; } || : + done + else + mirror_group_promote "${primary_cluster}" "${pool}/${group0}" + fi + + wait_for_group_status_in_pool_dir "${secondary_cluster}" "${pool}"/"${group0}" 'up+replaying' "${image_count}" + wait_for_group_status_in_pool_dir "${primary_cluster}" "${pool}"/"${group0}" 'up+stopped' + + # Write some data, take a regular mirror snapshot, wait for it to sync on secondary cluster + write_image "${primary_cluster}" "${pool}" "${image_prefix}0" 10 4096 + mirror_group_snapshot_and_wait_for_sync_complete "${secondary_cluster}" "${primary_cluster}" "${pool}"/"${group0}" + + # check that group and images were deleted and recreated on secondary cluster (as a result of the resync request) + get_id_from_group_info ${secondary_cluster} ${pool}/${group0} group_id_after + test "${group_id_before}" != "${group_id_after}" || fail "group not recreated by resync" + get_image_id2 ${secondary_cluster} ${pool}/${image_prefix}0 image_id_after + test "${image_id_before}" != "${image_id_after}" || fail "image not recreated by resync" + + group_remove "${primary_cluster}" "${pool}/${group0}" + wait_for_group_not_present "${primary_cluster}" "${pool}" "${group0}" + wait_for_group_not_present "${secondary_cluster}" "${pool}" "${group0}" + + images_remove "${primary_cluster}" "${pool}/${image_prefix}" "${image_count}" + stop_mirrors "${primary_cluster}" + check_daemon_running "${secondary_cluster}" +} + +# test ODF 
failover/failback sequence +declare -a test_resync_marker_1=("${CLUSTER2}" "${CLUSTER1}" "${pool0}" "${image_prefix}" 'no_change' 3) + +test_resync_marker_scenarios=1 + +# This test does the following: +# 1) setup a nothing-fancy group, write some data, let it sync from site-a to site-b +# 2) demote site-a +# 3) execute rbd mirror group resync command on site-b +# 4) assert that nothing happens at this point (on a bad build the group on site-b would be removed after some time, note that rbd-mirror daemons must be running for that to happen) +# 5) promote site-b +# 6) write some more data, let it sync from site-b to site-a +# 7) demote site-b +# 8) promote site-a +# 9) ensure that the group on site-b doesnt get resynced at this point +# 10) write some data on site-a and let it sync to site-b +# 11) check that site-b group id has not changed again - since just after step 5 +test_resync_marker() +{ + local primary_cluster=$1 ; shift + local secondary_cluster=$1 ; shift + local pool=$1 ; shift + local image_prefix=$1 ; shift + local scenario=$1 ; shift + local image_count=$(($1*"${image_multiplier}")) ; shift + + local snap0='snap_0' + local snap1='snap_1' + + # ODF has daemon running on both clusters always + start_mirrors "${primary_cluster}" + + group_create "${primary_cluster}" "${pool}/${group0}" + images_create "${primary_cluster}" "${pool}/${image_prefix}" "${image_count}" + write_image "${primary_cluster}" "${pool}" "${image_prefix}0" 10 4096 + group_images_add "${primary_cluster}" "${pool}/${group0}" "${pool}/${image_prefix}" "${image_count}" + + mirror_group_enable "${primary_cluster}" "${pool}/${group0}" + wait_for_group_present "${secondary_cluster}" "${pool}" "${group0}" "${image_count}" + wait_for_group_replay_started "${secondary_cluster}" "${pool}"/"${group0}" "${image_count}" + wait_for_group_status_in_pool_dir "${secondary_cluster}" "${pool}"/"${group0}" 'up+replaying' "${image_count}" + wait_for_group_status_in_pool_dir "${primary_cluster}" "${pool}"/"${group0}" 'up+stopped' + + wait_for_group_synced "${primary_cluster}" "${pool}"/"${group0}" + + local group_id_before group_id_after + get_id_from_group_info ${secondary_cluster} ${pool}/${group0} group_id_before + local image_id_before image_id_after + get_image_id2 ${secondary_cluster} ${pool}/${image_prefix}0 image_id_before + + # demote primary and request resync on secondary - check that group does not get deleted (due to resync request flag) + mirror_group_demote "${primary_cluster}" "${pool}/${group0}" + mirror_group_resync "${secondary_cluster}" "${pool}/${group0}" + wait_for_group_status_in_pool_dir "${secondary_cluster}" "${pool}"/"${group0}" 'up+stopped' + + get_id_from_group_info ${secondary_cluster} ${pool}/${group0} group_id_after + test "${group_id_before}" = "${group_id_after}" || fail "group recreated with no primary" + get_image_id2 ${secondary_cluster} ${pool}/${image_prefix}0 image_id_after + test "${image_id_before}" = "${image_id_after}" || fail "image recreated with no primary" + + # TODO next command fails (note that without the resync command above it succeeds) + # 2025-03-06T19:13:47.722+0000 7f655045cb40 -1 librbd::api::Mirror: group_promote: group test-group0 is still primary within a remote cluster + mirror_group_promote "${secondary_cluster}" "${pool}/${group0}" + + get_id_from_group_info ${secondary_cluster} ${pool}/${group0} group_id_after + test "${group_id_before}" = "${group_id_after}" || fail "group recreated" + get_image_id2 ${secondary_cluster} ${pool}/${image_prefix}0 image_id_after + 
test "${image_id_before}" = "${image_id_after}" || fail "image recreated" + + write_image "${secondary_cluster}" "${pool}" "${image_prefix}0" 10 4096 + mirror_group_snapshot_and_wait_for_sync_complete "${primary_cluster}" "${secondary_cluster}" "${pool}"/"${group0}" + + # demote - neither site is primary + mirror_group_demote "${secondary_cluster}" "${pool}/${group0}" + + # wait for the demote snapshot to be synced before promoting the other site + wait_for_group_synced "${secondary_cluster}" "${pool}"/"${group0}" + + # promote original primary again + mirror_group_promote "${primary_cluster}" "${pool}/${group0}" + + # confirm that group and image are not recreated - resync flag was cleared + get_id_from_group_info ${secondary_cluster} ${pool}/${group0} group_id_after + test "${group_id_before}" = "${group_id_after}" || fail "group recreated" + get_image_id2 ${secondary_cluster} ${pool}/${image_prefix}0 image_id_after + test "${image_id_before}" = "${image_id_after}" || fail "image recreated" + + # write some data, take a snapshot and wait for sync to complete + write_image "${primary_cluster}" "${pool}" "${image_prefix}0" 10 4096 + mirror_group_snapshot_and_wait_for_sync_complete "${secondary_cluster}" "${primary_cluster}" "${pool}"/"${group0}" + + # check that group and image ids still not changed on secondary + get_id_from_group_info ${secondary_cluster} ${pool}/${group0} group_id_after + test "${group_id_before}" = "${group_id_after}" || fail "group recreated" + get_image_id2 ${secondary_cluster} ${pool}/${image_prefix}0 image_id_after + test "${image_id_before}" = "${image_id_after}" || fail "image recreated" + + group_remove "${primary_cluster}" "${pool}/${group0}" + wait_for_group_not_present "${primary_cluster}" "${pool}" "${group0}" + wait_for_group_not_present "${secondary_cluster}" "${pool}" "${group0}" + + images_remove "${primary_cluster}" "${pool}/${image_prefix}" "${image_count}" + stop_mirrors "${primary_cluster}" + check_daemon_running "${secondary_cluster}" +} + # test resync scenarios declare -a test_resync_1=("${CLUSTER2}" "${CLUSTER1}" "${pool0}" "${image_prefix}" 'no_change' 3) @@ -2423,7 +2808,7 @@ test_resync() echo "id = ${primary_group_snap_id}" # stop the daemon to prevent further syncing of snapshots - stop_mirrors "${secondary_cluster}" + stop_mirrors "${secondary_cluster}" '-9' # promote secondary and change data on image mirror_group_promote "${secondary_cluster}" "${pool}/${group0}" '--force' @@ -2444,7 +2829,7 @@ test_resync() # Repeat the test this time changing the data on the primary too. # stop the daemon to prevent further syncing of snapshots - stop_mirrors "${secondary_cluster}" + stop_mirrors "${secondary_cluster}" '-9' # promote secondary and change data on image mirror_group_promote "${secondary_cluster}" "${pool}/${group0}" '--force' @@ -2469,7 +2854,7 @@ test_resync() # Repeat the test this time swapping the primary and secondary and resyncing back to the new secondary. 
# stop the daemon to prevent further syncing of snapshots - stop_mirrors "${secondary_cluster}" + stop_mirrors "${secondary_cluster}" '-9' # promote secondary mirror_group_promote "${secondary_cluster}" "${pool}/${group0}" '--force' @@ -2504,6 +2889,29 @@ test_resync() start_mirrors "${secondary_cluster}" } +check_for_no_keys() +{ + local primary_cluster=$1 + local secondary_cluster=$2 + local cluster pools pool key_count obj_count + + for cluster in ${primary_cluster} ${secondary_cluster}; do + local pools + pools=$(CEPH_ARGS='' ceph --cluster ${cluster} osd pool ls | grep -v "^\." | xargs) + + for pool in ${pools}; do + # see if the rbd_mirror_leader object exists in the pool + get_pool_obj_count "${cluster}" "${pool}" "rbd_mirror_leader" obj_count + + # if it does then check that there are no entries left in it + if [ $obj_count -gt 0 ]; then + count_omap_keys_with_filter "${cluster}" "${pool}" "rbd_mirror_leader" "image_map" key_count + test "${key_count}" = 0 || fail "last test left keys" + fi + done + done +} + run_test() { local test_name=$1 @@ -2511,6 +2919,37 @@ run_test() declare -n test_parameters="$test_name"_"$test_scenario" + local primary_cluster=cluster2 + local secondary_cluster=cluster1 + + # If the tmpdir and cluster conf file exist then reuse the existing cluster + # but stop the daemon on the primary if it was left running by the last test + # and check that there are no unexpected objects left + if [ -d "${RBD_MIRROR_TEMDIR}" ] && [ -f "${RBD_MIRROR_TEMDIR}"'/cluster1.conf' ] + then + export RBD_MIRROR_USE_EXISTING_CLUSTER=1 + + # need to call this before checking the current state + setup_tempdir + + # look at every pool on both clusters and check that there are no entries leftover in rbd_image_leader + check_for_no_keys "${primary_cluster}" "${secondary_cluster}" + + # if the "mirror" pool doesn't exist then call setup to recreate all the required pools + local pool_count + get_pool_count "${primary_cluster}" 'mirror' pool_count + if [ 0 = ${pool_count} ]; then + setup + fi + else + setup + fi + + # stop mirror daemon if it has been left running on the primary cluster + stop_mirrors "${primary_cluster}" + # restart mirror daemon if it has been stopped on the secondary cluster + start_mirrors "${secondary_cluster}" + testlog "TEST:$test_name scenario:$test_scenario parameters:" "${test_parameters[@]}" "$test_name" "${test_parameters[@]}" } @@ -2553,17 +2992,17 @@ run_all_tests() run_test_all_scenarios test_force_promote run_test_all_scenarios test_resync run_test_all_scenarios test_remote_namespace - run_test_all_scenarios test_multiple_user_snapshot_whilst_stopped + run_test_all_scenarios test_multiple_mirror_group_snapshot_whilst_stopped run_test_all_scenarios test_create_group_with_image_remove_then_repeat + run_test_all_scenarios test_enable_disable_repeat run_test_all_scenarios test_empty_group_omap_keys #run_test_all_scenarios test_group_with_clone_image - run_test_all_scenarios test_multiple_user_snapshot_time + run_test_all_scenarios test_multiple_mirror_group_snapshot_unlink_time run_test_all_scenarios test_force_promote_delete_group run_test_all_scenarios test_create_group_stop_daemon_then_recreate - #run_test_all_scenarios test_enable_mirroring_when_duplicate_group_exists - - #FIXME: This test leaves residual groups on secondary moving it to the end. 
- run_test_all_scenarios test_enable_disable_repeat + run_test_all_scenarios test_enable_mirroring_when_duplicate_group_exists + run_test_all_scenarios test_odf_failover_failback + #run_test_all_scenarios test_resync_marker } if [ -n "${RBD_MIRROR_SHOW_CLI_CMD}" ]; then @@ -2572,22 +3011,6 @@ else set -ex fi -# If the tmpdir and cluster conf file exist then reuse the existing cluster -if [ -d "${RBD_MIRROR_TEMDIR}" ] && [ -f "${RBD_MIRROR_TEMDIR}"'/cluster1.conf' ] -then - export RBD_MIRROR_USE_EXISTING_CLUSTER=1 -fi - -setup - -# see if we need to (re)start rbd-mirror deamon -pid=$(cat "$(daemon_pid_file "${CLUSTER1}")" 2>/dev/null) || : -if [ -z "${pid}" ] -then - start_mirrors "${CLUSTER1}" -fi -check_daemon_running "${CLUSTER1}" - # restore the arguments from the cli set -- "${args[@]}" diff --git a/qa/workunits/rbd/rbd_mirror_helpers.sh b/qa/workunits/rbd/rbd_mirror_helpers.sh index 211696a3c055f..7da7a72d68093 100755 --- a/qa/workunits/rbd/rbd_mirror_helpers.sh +++ b/qa/workunits/rbd/rbd_mirror_helpers.sh @@ -187,6 +187,8 @@ run_cmd_internal() { export CEPH_ARGS="--id ${CEPH_ID}" fi + echo "${cmd}" >> "${TEMPDIR}/rbd-mirror.cmd.log" + # Don't exit immediately if the command exits with a non-zero status. set +e $cmd >"${CMD_STDOUT}" 2>"${CMD_STDERR}" @@ -288,6 +290,11 @@ daemon_pid_file() echo $(ceph-conf --cluster $cluster --name "client.${MIRROR_USER_ID_PREFIX}${instance}" 'pid file') } +echo_red() +{ + echo -e "${RED}$@${NO_COLOUR}" +} + testlog() { echo -e "${RED}"$(date '+%F %T') $@ "${NO_COLOUR}"| tee -a "${TEMPDIR}/rbd-mirror.test.log" >&2 @@ -489,8 +496,8 @@ setup() if [ -z "${RBD_MIRROR_USE_EXISTING_CLUSTER}" ]; then setup_cluster "${CLUSTER1}" setup_cluster "${CLUSTER2}" + setup_dummy_objects "${CLUSTER1}" fi - setup_dummy_objects "${CLUSTER1}" setup_pools "${CLUSTER1}" "${CLUSTER2}" setup_pools "${CLUSTER2}" "${CLUSTER1}" @@ -957,7 +964,7 @@ get_newest_mirror_snapshot() local log=$4 rbd --cluster "${cluster}" snap list --all "${pool}/${image}" --format xml | \ - $XMLSTARLET sel -t -c "//snapshots/snapshot[namespace/complete='true' and position()=last()]" > \ + $XMLSTARLET sel -t -c "(//snapshots/snapshot[namespace/complete='true'])[last()]" > \ ${log} || true } @@ -1089,6 +1096,18 @@ test_fields_in_group_info() test "${fields_arr[5]}" = "${expected_is_primary}" || { fail "primary = ${fields_arr[5]}"; return 1; } } +get_id_from_group_info() +{ + local cluster=$1 ; shift + local group_spec=$1 ; shift + local -n _result=$1 ; shift + + local fields=(//group/group_id) + local fields_arr + get_fields_from_group_info "${cluster}" "${group_spec}" fields_arr "${fields[@]}" + _result="${fields_arr[0]}" +} + get_fields_from_mirror_image_status() { local cluster=$1 ; shift @@ -1472,7 +1491,7 @@ create_snapshot() local image=$3 local snap=$4 - rbd --cluster ${cluster} snap create ${pool}/${image}@${snap} + run_cmd "rbd --cluster ${cluster} snap create ${pool}/${image}@${snap}" } remove_snapshot() @@ -1916,6 +1935,16 @@ get_image_id() sed -ne 's/^.*block_name_prefix: rbd_data\.//p' } +get_image_id2() +{ + local cluster=$1 + local image_spec=$2 + local -n _id=$3 + + run_cmd "rbd --cluster ${cluster} info ${image_spec} --format xml --pretty-format" + _id=$($XMLSTARLET sel -t -v "//image/id" "$CMD_STDOUT") || { fail "no id!"; return; } +} + get_image_mirroring_global_id() { local cluster=$1 @@ -2192,7 +2221,12 @@ mirror_group_disable() local cluster=$1 ; shift local group_spec=$1 ; shift - run_cmd "rbd --cluster=${cluster} mirror group disable $* ${group_spec}" + local force + if [ -n "$1" 
]; then + force=$1; shift + fi + + run_cmd "rbd --cluster=${cluster} mirror group disable $* ${group_spec} ${force}" } create_group_and_enable_mirror() @@ -2218,8 +2252,16 @@ mirror_group_promote() local cluster=$1 local group_spec=$2 local force=$3 + local runner=${4:-"run_cmd"} + + "$runner" "rbd --cluster=${cluster} mirror group promote ${group_spec} ${force}" +} + +mirror_group_promote_try() +{ + local force=${3:-''} - run_cmd "rbd --cluster=${cluster} mirror group promote ${group_spec} ${force}" + mirror_group_promote "$@" "${force}" "try_cmd" } mirror_group_snapshot() @@ -2280,6 +2322,31 @@ get_group_snap_name() _group_snap_name="$($XMLSTARLET sel -t -v "//group_snaps/group_snap[id='${snap_id}']/snapshot" < "$CMD_STDOUT")" } +get_pool_count() +{ + local cluster=$1 + local pool_name=$2 + local -n _count=$3 + + run_cmd "ceph --cluster ${cluster} osd pool ls --format xml-pretty" + if [ "${pool_name}" = '*' ]; then + _count="$($XMLSTARLET sel -t -v "count(//pools/pool_name)" < "$CMD_STDOUT")" + else + _count="$($XMLSTARLET sel -t -v "count(//pools[pool_name='${pool_name}'])" < "$CMD_STDOUT")" + fi +} + +get_pool_obj_count() +{ + local cluster=$1 + local pool=$2 + local obj_name=$3 + local -n _count=$4 + + run_cmd "rados --cluster ${cluster} -p ${pool} ls --format xml-pretty" + _count="$($XMLSTARLET sel -t -v "count(//objects/object[name='${obj_name}'])" < "$CMD_STDOUT")" +} + get_image_snap_id_from_group_snap_info() { local cluster=$1 @@ -2633,13 +2700,12 @@ get_newest_group_snapshot_id() # TODO - have seen this next cmd fail with rc=2 and an empty list # this should not happen, but if it does then retry as a temp workaround try_cmd "rbd --cluster ${cluster} group snap list ${group_spec} --format xml --pretty-format" && - { _group_snap_id=$(xmlstarlet sel -t -v "//group_snaps/group_snap[state='complete' and position()=last()]/id" "$CMD_STDOUT" ); return; } - + { _group_snap_id=$(xmlstarlet sel -t -v "(//group_snaps/group_snap[state='complete']/id)[last()]" "$CMD_STDOUT" ); return; } for s in 0.1 1 2 4 8 8 8 8 8 8 8 8 16 16; do echo -e "${RED}RETRYING COMMAND${NO_COLOUR}"; sleep ${s} try_cmd "rbd --cluster ${cluster} group snap list ${group_spec} --format xml --pretty-format" && { - _group_snap_id=$(xmlstarlet sel -t -v "//group_snaps/group_snap[state='complete' and position()=last()]/id" "$CMD_STDOUT" ); return; } + _group_snap_id=$(xmlstarlet sel -t -v "(//group_snaps/group_snap[state='complete']/id)[last()]" "$CMD_STDOUT" ); return; } done fail "Failed to execute command" return 1 @@ -2767,52 +2833,39 @@ wait_for_group_status_in_pool_dir() return 1 } -# useful function that attempts to delete all groups and images -tidy() +stop_daemons_on_clusters() { - local primary_cluster=cluster2 - local secondary_cluster=cluster1 - local cluster pool group group_spec image image_spec + local cluster_list=$1 + local cluster - for cluster in ${primary_cluster} ${secondary_cluster}; do + for cluster in ${cluster_list}; do echo 'cluster:'${cluster} stop_mirrors ${cluster} '-9' done +} - for cluster in ${primary_cluster} ${secondary_cluster}; do +delete_pools_on_clusters() +{ + local cluster_list=$1 + local cluster + + for cluster in ${cluster_list}; do echo 'cluster:'${cluster} for pool in $(CEPH_ARGS='' ceph --cluster ${cluster} osd pool ls | grep -v "^\." 
| xargs); do echo 'pool:'${pool} run_admin_cmd "ceph --cluster ${cluster} osd pool delete ${pool} ${pool} --yes-i-really-really-mean-it" done done +} - # following is old method that used to remove individual object rather than removing entire pools - : ' - for cluster in ${primary_cluster} ${secondary_cluster}; do - echo 'cluster:'${cluster} - for pool in "${POOL}" "${PARENT_POOL}" "${POOL}/${NS1}" "${POOL}/${NS2}"; do - echo 'pool:'${pool} - for group in $(rbd --cluster ${cluster} group list ${pool} | xargs); do - group_spec=${pool}/${group} - echo 'group_spec:'${group_spec} - mirror_group_disable ${cluster} ${group_spec} '--force' - for image_spec in $(rbd --cluster ${cluster} group image list ${group_spec} | xargs); do - echo 'image_spec:'"${image_spec}" - mirror_image_disable ${cluster} ${image_spec} '--force' - group_image_remove ${cluster} ${group_spec} ${image_spec} - done - group_remove ${cluster} ${group_spec} - done - for image in $(rbd --cluster ${cluster} list ${pool} | xargs); do - image_spec=${pool}/${image} - echo 'image_spec:'"${image_spec}" - mirror_image_disable ${cluster} ${image_spec} '--force' - image_remove ${cluster} ${image_spec} - done - done - done - ' +# stops all daemons and deletes all pools (groups and images included) +tidy() +{ + local primary_cluster=cluster2 + local secondary_cluster=cluster1 + + stop_daemons_on_clusters "${primary_cluster} ${secondary_cluster}" + delete_pools_on_clusters "${primary_cluster} ${secondary_cluster}" } # list all groups, images and snaps -- 2.39.5