]> git-server-git.apps.pok.os.sepia.ceph.com Git - ceph-ci.git/commitdiff
rbd-mirror: allow resync while a group snapshot is still syncing
authorPrasanna Kumar Kalever <prasanna.kalever@redhat.com>
Thu, 11 Dec 2025 05:23:50 +0000 (10:53 +0530)
committerPrasanna Kumar Kalever <prasanna.kalever@redhat.com>
Thu, 19 Feb 2026 08:35:24 +0000 (14:05 +0530)
Currently, a resync operation is not allowed while a snapshot sync is still in
progress; it has to wait until the sync is fully done. This means that if
snapshot synchronization becomes stuck for any reason, a resync cannot be
triggered, resulting in an undesirable operational limitation.

This change enables resync requests to be processed even when a group snapshot
is still syncing, allowing a resync in the middle of syncing a group snapshot.

Signed-off-by: Prasanna Kumar Kalever <prasanna.kalever@redhat.com>
qa/workunits/rbd/rbd_mirror_group_simple.sh

index 31389a585058599a4a0cb7348131418c7b6489ef..cdc1d366524e3b6bca76e20647a79aabbd2a24b4 100755 (executable)
@@ -3161,6 +3161,96 @@ test_interrupted_sync()
   image_remove "${primary_cluster}" "${pool}/${big_image}"
 }
 
+# test that a group resync can be flagged while a mirror group snapshot is
+# still syncing on the secondary
+#
+# Scenario 1: The snapshot on the secondary is in the creating phase when the daemon is restarted, then resync is flagged.
+declare -a test_interrupted_sync_and_resync_1=("${CLUSTER2}" "${CLUSTER1}" "${pool0}" "${image_prefix}" 'resync_when_snap_creating' 2)
+# Scenario 2: The snapshot on the secondary is in the created phase when the daemon is restarted, then resync is flagged.
+declare -a test_interrupted_sync_and_resync_2=("${CLUSTER2}" "${CLUSTER1}" "${pool0}" "${image_prefix}" 'resync_when_snap_created' 2)
+
+test_interrupted_sync_and_resync_scenarios=2
+
+# Arguments:
+#   $1 - primary cluster
+#   $2 - secondary cluster
+#   $3 - pool
+#   $4 - image name prefix
+#   $5 - scenario: 'resync_when_snap_creating' or 'resync_when_snap_created'
+#   $6 - image count (multiplied by the global ${image_multiplier})
+# Returns non-zero without touching either cluster if the scenario is unknown.
+test_interrupted_sync_and_resync()
+{
+  local primary_cluster=$1 ; shift
+  local secondary_cluster=$1 ; shift
+  local pool=$1 ; shift
+  local image_prefix=$1 ; shift
+  local scenario=$1 ; shift
+  local image_count=$(($1*"${image_multiplier}")) ; shift
+  local group0=test-group0
+  local snap0='snap_0'
+  local big_image=test-image-big
+
+  # Map the scenario onto the group snapshot state in which the sync is
+  # interrupted. Reject anything else up front: an unrecognised scenario would
+  # otherwise run the whole test without ever interrupting the sync.
+  local snap_state
+  case "${scenario}" in
+    resync_when_snap_creating)
+      snap_state='creating'
+      ;;
+    resync_when_snap_created)
+      snap_state='created'
+      ;;
+    *)
+      echo "test_interrupted_sync_and_resync: unknown scenario '${scenario}'" >&2
+      return 1
+      ;;
+  esac
+
+  start_mirrors "${primary_cluster}"
+  start_mirrors "${secondary_cluster}"
+
+  # group with one small image (carrying a reference user snapshot) and one big image
+  group_create "${primary_cluster}" "${pool}/${group0}"
+  image_create "${primary_cluster}" "${pool}/${image_prefix}1" 1G
+  write_image "${primary_cluster}" "${pool}" "${image_prefix}1" 10 4096
+
+  image_create "${primary_cluster}" "${pool}/${big_image}" 4G
+  group_image_add "${primary_cluster}" "${pool}/${group0}" "${pool}/${image_prefix}1"
+  group_image_add "${primary_cluster}" "${pool}/${group0}" "${pool}/${big_image}"
+
+  create_snapshot "${primary_cluster}" "${pool}" "${image_prefix}1" "${snap0}"
+  compare_image_with_snapshot "${primary_cluster}" "${pool}/${image_prefix}1" "${primary_cluster}" "${pool}/${image_prefix}1@${snap0}"
+
+  # enable mirroring and wait for the initial sync to complete
+  mirror_group_enable "${primary_cluster}" "${pool}/${group0}"
+  wait_for_group_present "${secondary_cluster}" "${pool}" "${group0}" "${image_count}"
+  wait_for_group_replay_started "${secondary_cluster}" "${pool}"/"${group0}" "${image_count}"
+  wait_for_group_status_in_pool_dir "${secondary_cluster}" "${pool}"/"${group0}" 'up+replaying' "${image_count}"
+  wait_for_group_status_in_pool_dir "${primary_cluster}" "${pool}"/"${group0}" 'up+stopped' "${image_count}"
+  wait_for_group_synced "${primary_cluster}" "${pool}"/"${group0}" "${secondary_cluster}" "${pool}"/"${group0}"
+
+  # write to the big image so that the next group snapshot has data to sync
+  write_image "${primary_cluster}" "${pool}" "${big_image}" 1024 4194304
+
+  local group_snap_id
+  mirror_group_snapshot "${primary_cluster}" "${pool}/${group0}" group_snap_id
+
+  local image_snap_id
+  wait_for_image_snapshot_with_group_snap_info "${secondary_cluster}" "${pool}" "${image_prefix}1" "${group_snap_id}" image_snap_id
+
+  # interrupt the sync while the group snapshot is in the requested state
+  stop_mirror_while_group_snapshot_incomplete "${secondary_cluster}" "${pool}" "${group0}" "${group_snap_id}" "${snap_state}"
+  test_group_snap_state "${secondary_cluster}" "${pool}" "${group0}" "${group_snap_id}" "${snap_state}"
+  if [ "${snap_state}" = 'created' ]; then
+    test_group_snap_sync_incomplete "${secondary_cluster}" "${pool}/${group0}" "${group_snap_id}"
+  fi
+
+  local group_id_before
+  get_id_from_group_info "${secondary_cluster}" "${pool}/${group0}" group_id_before
+
+  # Flag the resync
+  mirror_group_resync "${secondary_cluster}" "${pool}"/"${group0}"
+
+  # Start the mirror daemon
+  start_mirrors "${secondary_cluster}"
+  test_group_snap_sync_incomplete "${secondary_cluster}" "${pool}/${group0}" "${group_snap_id}"
+
+  # Notice that the group will be resynced immediately, without having to wait
+  # for the snapshot to reach the CREATED state
+  wait_for_group_id_changed "${secondary_cluster}" "${pool}/${group0}" "${group_id_before}"
+
+  # confirm that data on secondary again matches initial snapshot on primary
+  wait_for_group_synced "${primary_cluster}" "${pool}"/"${group0}" "${secondary_cluster}" "${pool}"/"${group0}"
+  test_group_snap_sync_complete "${secondary_cluster}" "${pool}/${group0}" "${group_snap_id}"
+
+  compare_image_with_snapshot "${secondary_cluster}" "${pool}/${image_prefix}1" "${primary_cluster}" "${pool}/${image_prefix}1@${snap0}"
+
+  wait_for_group_status_in_pool_dir "${primary_cluster}" "${pool}"/"${group0}" 'up+stopped' "${image_count}"
+  wait_for_group_status_in_pool_dir "${secondary_cluster}" "${pool}"/"${group0}" 'up+replaying' "${image_count}"
+
+  # tidy up
+  mirror_group_disable "${primary_cluster}" "${pool}/${group0}"
+  group_remove "${primary_cluster}" "${pool}/${group0}"
+
+  wait_for_group_not_present "${primary_cluster}" "${pool}" "${group0}"
+  wait_for_group_not_present "${secondary_cluster}" "${pool}" "${group0}"
+
+  image_remove "${primary_cluster}" "${pool}/${image_prefix}1"
+  image_remove "${primary_cluster}" "${pool}/${big_image}"
+}
+
 # test force unlink time
 declare -a test_multiple_mirror_group_snapshot_unlink_time_1=("${CLUSTER2}" "${CLUSTER1}" "${pool0}")
 
@@ -3977,6 +4067,7 @@ run_all_tests()
   run_test_all_scenarios test_group_with_clone_image
   run_test_all_scenarios test_interrupted_sync_restarted_daemon
   run_test_all_scenarios test_interrupted_sync
+  run_test_all_scenarios test_interrupted_sync_and_resync
   run_test_all_scenarios test_resync_after_relocate_and_force_promote
   run_test_all_scenarios test_multiple_mirror_group_snapshot_unlink_time
   run_test_all_scenarios test_force_promote_delete_group