From: Ramana Raja Date: Mon, 29 Dec 2025 22:17:28 +0000 (-0500) Subject: mgr/rbd_support: Stagger mirror snapshot and trash purge schedules X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=9c412c763352a76d83c117971548ec97b6974a32;p=ceph-ci.git mgr/rbd_support: Stagger mirror snapshot and trash purge schedules Previously, multiple images or namespaces scheduled with the same interval ran mirror snapshots or trash purges at around the same time, creating spikes in cluster activity. This change staggers scheduled jobs by: - Adding a deterministic phase offset per image or namespace when no start-time is set. - Picking a random element from the queue at each scheduled time, rather than always the first. Together, these changes spread snapshot and trash purge operations more evenly over time and improve cluster stability. Fixes: https://tracker.ceph.com/issues/74288 Signed-off-by: Ramana Raja --- diff --git a/PendingReleaseNotes b/PendingReleaseNotes index e14ae6e48bd..0f366f1d001 100644 --- a/PendingReleaseNotes +++ b/PendingReleaseNotes @@ -50,6 +50,9 @@ synchronized using full copy, as blockdiff is not efficient for small files. The threshold is controlled by the new configuration option cephfs_mirror_blockdiff_min_file_size (default: 16_M). For more information, see https://tracker.ceph.com/issues/73452 +* RBD: Mirror snapshot creation and trash purge schedules are now automatically + staggered when no explicit "start-time" is specified. This reduces + synchronized scheduling spikes and distributes work more evenly over time. >=20.0.0 diff --git a/doc/rbd/rbd-mirroring.rst b/doc/rbd/rbd-mirroring.rst index 7602191ca81..aaa28ac63d8 100644 --- a/doc/rbd/rbd-mirroring.rst +++ b/doc/rbd/rbd-mirroring.rst @@ -421,6 +421,10 @@ globally, per-pool, or per-image levels. Multiple mirror-snapshot schedules can be defined at any level, but only the most-specific snapshot schedules that match an individual mirrored image will run. +When multiple images share the same schedule interval and no explicit +``start-time`` is defined, snapshot execution is automatically staggered across +the interval to reduce scheduling spikes. + To create a mirror-snapshot schedule with ``rbd``, specify the ``mirror snapshot schedule add`` command along with an optional pool or image name; interval; and optional start time:: diff --git a/qa/workunits/rbd/cli_generic.sh b/qa/workunits/rbd/cli_generic.sh index 8cef48b238c..d59fa4e9b62 100755 --- a/qa/workunits/rbd/cli_generic.sh +++ b/qa/workunits/rbd/cli_generic.sh @@ -1336,6 +1336,163 @@ test_trash_purge_schedule_recovery() { } +test_trash_purge_schedule_staggering() { + echo "Testing trash purge schedule staggering..." + remove_images + ceph osd pool create rbd2 8 + rbd pool init rbd2 + + # Initial empty check + test "$(ceph rbd trash purge schedule list)" = "{}" + ceph rbd trash purge schedule status | fgrep '"scheduled": []' + + # Create 50 namespaces + for i in {1..50}; do + rbd namespace create "rbd2/test$i" + done + + # Helper to get status JSON and verify all namespaces are scheduled + get_trash_purge_schedule_status() { + local num_scheduled=$1 + local -n status_ref=$2 + + local list_json + list_json=$(rbd trash purge schedule ls -p rbd2 -R --format json) + + local list_namespaces=() + mapfile -t list_namespaces < <( + jq -r 'sort_by(.namespace) | .[].namespace' <<< "$list_json" + ) + [ "${#list_namespaces[@]}" -eq "$num_scheduled" ] || return 1 + + # Poll status until it has all scheduled namespaces + for ((j=0; j<12; j++)); do + status_ref=$(rbd trash purge schedule status -p rbd2 --format json) + [ "$(jq 'length' <<< "$status_ref")" -eq "${#list_namespaces[@]}" ] && break + sleep 10 + done + + local status_namespaces=() + mapfile -t status_namespaces < <( + jq -r 'sort_by(.namespace) | .[].namespace' <<< "$status_ref" + ) + for i in "${!list_namespaces[@]}"; do + [[ "${list_namespaces[i]}" != "${status_namespaces[i]}" ]] && return 1; + done + return 0 + } + + # Helper to check staggering of schedules + are_trash_purge_schedules_staggered() { + local status_json=$1 + local interval_min=$2 + local unique_times=() + mapfile -t unique_times < <(jq -r '.[].schedule_time' <<< "$status_json" | sort -u) + + # Expect one unique time slot per interval minute (1-minute scheduler granularity) + [ "${#unique_times[@]}" -eq "$interval_min" ] || return 1 + + # Check that consecutive schedule times are exactly 1 minute apart + local prev_epoch=$(( $(date -d "${unique_times[0]}" +%s)/60 )) + for ((i=1;i<${#unique_times[@]};i++)); do + local curr=$(( $(date -d "${unique_times[i]}" +%s)/60 )) + [ $((curr - prev_epoch)) -eq 1 ] || return 1 + prev_epoch=$curr + done + return 0 + } + + # Verify that `schedule add/rm` maintains proper staggering + local interval_min=5 + local status_json + local num_scheduled_namespaces=40 + # Schedule namespaces test1..test40 + for ((i=1; i<=40; i++)); do + rbd trash purge schedule add -p "rbd2/test$i" "${interval_min}m" + done + get_trash_purge_schedule_status "$num_scheduled_namespaces" status_json + are_trash_purge_schedules_staggered "$status_json" "$interval_min" + + # Shift scheduling range to test6..test45 + for ((i=41; i<=45; i++)); do + rbd trash purge schedule add -p "rbd2/test$i" "${interval_min}m" + done + for ((i=1; i<=5; i++)); do + rbd trash purge schedule rm -p "rbd2/test$i" + done + get_trash_purge_schedule_status "$num_scheduled_namespaces" status_json + are_trash_purge_schedules_staggered "$status_json" "$interval_min" + + # Shift scheduling range to test11..test50 + for ((i=46; i<=50; i++)); do + rbd trash purge schedule add -p "rbd2/test$i" "${interval_min}m" + done + for ((i=6; i<=10; i++)); do + rbd trash purge schedule rm -p "rbd2/test$i" + done + get_trash_purge_schedule_status "$num_scheduled_namespaces" status_json + are_trash_purge_schedules_staggered "$status_json" "$interval_min" + + # Add schedules for test1..test10 with explicit start time. + # These should all share the same next schedule_time. + num_scheduled_namespaces=50 + for ((i=1; i<=10; i++)); do + rbd trash purge schedule add -p "rbd2/test$i" "${interval_min}m" 2020-01-01 + done + + # Get updated status + get_trash_purge_schedule_status "$num_scheduled_namespaces" status_json + + # Split status into two sets: + # test1..test10 (explicit start-time) + # test11..test50 (should remain staggered) + local anchored_times=() + local staggered_json + + # Extract schedule times for namespaces test1..test10 + mapfile -t anchored_times < <( + jq -r '.[] + | select(.namespace | test("^test([1-9]|10)$")) + | .schedule_time' <<< "$status_json" | sort -u + ) + + # All anchored schedules should share exactly one schedule_time + [ "${#anchored_times[@]}" -eq 1 ] || return 1 + + # Extract JSON only for namespaces test11..test50 + staggered_json=$(jq ' + map(select(.namespace | test("^test(1[1-9]|[2-4][0-9]|50)$"))) + ' <<< "$status_json") + + # Verify these remain properly staggered + are_trash_purge_schedules_staggered "$staggered_json" "$interval_min" + + # Cleanup: remove all schedules + for ((i=1; i<=50; i++)); do + rbd trash purge schedule rm -p "rbd2/test$i" + done + + # Wait until schedule status becomes empty + local empty_status + for ((j=0; j<12; j++)); do + empty_status=$(rbd trash purge schedule status -p rbd2 --format json) + [ "$(jq 'length' <<< "$empty_status")" -eq 0 ] && break + sleep 5 + done + + [ "$(jq 'length' <<< "$empty_status")" -eq 0 ] || { + echo "Error: trash purge schedule status not empty after removals" + return 1 + } + + # Remove namespaces + for ((i=1; i<=50; i++)); do + rbd namespace rm "rbd2/test$i" + done + + ceph osd pool rm rbd2 rbd2 --yes-i-really-really-mean-it +} + test_mirror_snapshot_schedule() { echo "testing mirror snapshot schedule..." remove_images @@ -1522,6 +1679,166 @@ test_mirror_snapshot_schedule_recovery() { ceph osd pool rm rbd3 rbd3 --yes-i-really-really-mean-it } +test_mirror_snapshot_schedule_staggering() { + echo "Testing mirror snapshot schedule staggering..." + + remove_images + ceph osd pool create rbd2 8 + rbd pool init rbd2 + rbd mirror pool enable rbd2 image + rbd mirror pool peer add rbd2 cluster1 + + # Initial empty check + test "$(ceph rbd mirror snapshot schedule list)" = "{}" + ceph rbd mirror snapshot schedule status | fgrep '"scheduled_images": []' + + # Create 50 images + for i in {1..50}; do + rbd create $RBD_CREATE_ARGS -s 1 "rbd2/test$i" + rbd mirror image enable "rbd2/test$i" snapshot + done + + # Helper to get status JSON and verify all images are scheduled + get_mirror_snapshot_schedule_status() { + local num_scheduled=$1 + local -n status_ref=$2 + + local list_json + list_json=$(rbd mirror snapshot schedule ls -p rbd2 -R --format json) + + local list_images=() + mapfile -t list_images < <( + jq -r 'sort_by(.image) | .[].image' <<< "$list_json" + ) + [ "${#list_images[@]}" -eq "$num_scheduled" ] || return 1 + + # Poll status until it has all scheduled images + for ((j=0;j<12;j++)); do + status_ref=$(rbd mirror snapshot schedule status -p rbd2 --format json) + [ "$(jq 'length' <<< "$status_ref")" -eq "${#list_images[@]}" ] && break + sleep 10 + done + + local status_images=() + mapfile -t status_images < <( + jq -r 'sort_by(.image) | .[].image | split("/")[-1]' <<< "$status_ref" + ) + for i in "${!list_images[@]}"; do + [[ "${list_images[i]}" != "${status_images[i]}" ]] && return 1; + done + return 0 + } + + # Helper to check staggering of schedules + are_mirror_snapshot_schedules_staggered() { + local status_json=$1 interval_min=$2 + local unique_times=() + mapfile -t unique_times < <( + jq -r '.[].schedule_time' <<< "$status_json" | sort -u + ) + # Expect one unique time slot per interval minute (1-minute scheduler granularity) + [ "${#unique_times[@]}" -eq "$interval_min" ] || return 1 + + # Check that consecutive schedule times are exactly 1 minute apart + local prev_epoch=$(( $(date -d "${unique_times[0]}" +%s)/60 )) + for ((i=1;i<${#unique_times[@]};i++)); do + local curr=$(( $(date -d "${unique_times[i]}" +%s)/60 )) + [ $((curr - prev_epoch)) -eq 1 ] || return 1 + prev_epoch=$curr + done + return 0 + } + + # Verify that `schedule add/rm` maintains proper staggering + local interval_min=5 + local status_json + local num_scheduled_images=40 + # Schedule images test1..test40 + for ((i=1; i<=40; i++)); do + rbd mirror snapshot schedule add -p rbd2 --image "test$i" "${interval_min}m" + done + get_mirror_snapshot_schedule_status "$num_scheduled_images" status_json + are_mirror_snapshot_schedules_staggered "$status_json" "$interval_min" + + # Shift scheduling range to test6..test45 + for ((i=41; i<=45; i++)); do + rbd mirror snapshot schedule add -p rbd2 --image "test$i" "${interval_min}m" + done + for ((i=1; i<=5; i++)); do + rbd mirror snapshot schedule rm -p rbd2 --image "test$i" + done + get_mirror_snapshot_schedule_status "$num_scheduled_images" status_json + are_mirror_snapshot_schedules_staggered "$status_json" "$interval_min" + + # Shift scheduling range to test11..test50 + for ((i=46; i<=50; i++)); do + rbd mirror snapshot schedule add -p rbd2 --image "test$i" "${interval_min}m" + done + for ((i=6; i<=10; i++)); do + rbd mirror snapshot schedule rm -p rbd2 --image "test$i" + done + get_mirror_snapshot_schedule_status "$num_scheduled_images" status_json + are_mirror_snapshot_schedules_staggered "$status_json" "$interval_min" + + # Add schedules for test1..test10 with explicit start time. + # These should all share the same next schedule_time. + num_scheduled_images=50 + for ((i=1; i<=10; i++)); do + rbd mirror snapshot schedule add -p rbd2 --image "test$i" "${interval_min}m" 2020-01-01 + done + + # Get updated status + get_mirror_snapshot_schedule_status "$num_scheduled_images" status_json + + # Split status into two sets: + # test1..test10 (explicit start-time) + # test11..test50 (should remain staggered) + local anchored_times=() + local staggered_json + + # Extract schedule times for images test1..test10 + mapfile -t anchored_times < <( + jq -r '.[] + | select(.image | test("^rbd2/test([1-9]|10)$")) + | .schedule_time' <<< "$status_json" | sort -u + ) + + # All anchored schedules should share exactly one schedule_time + [ "${#anchored_times[@]}" -eq 1 ] || return 1 + + # Extract JSON only for images test11..test50 + staggered_json=$(jq ' + map(select(.image | test("^rbd2/test(1[1-9]|[2-4][0-9]|50)$"))) + ' <<< "$status_json") + + # Verify these remain properly staggered + are_mirror_snapshot_schedules_staggered "$staggered_json" "$interval_min" + + # Cleanup: remove all schedules + for ((i=1; i<=50; i++)); do + rbd mirror snapshot schedule rm -p rbd2 --image "test$i" + done + + # Wait until schedule status becomes empty + local empty_status + for ((j=0; j<12; j++)); do + empty_status=$(rbd mirror snapshot schedule status -p rbd2 --format json) + [ "$(jq 'length' <<< "$empty_status")" -eq 0 ] && break + sleep 5 + done + [ "$(jq 'length' <<< "$empty_status")" -eq 0 ] || { + echo "Error: snapshot schedule status not empty after removals" + return 1 + } + + # Remove images + for ((i=1; i<=50; i++)); do + rbd rm "rbd2/test$i" + done + + ceph osd pool rm rbd2 rbd2 --yes-i-really-really-mean-it +} + test_perf_image_iostat() { echo "testing perf image iostat..." remove_images @@ -1780,8 +2097,10 @@ test_thick_provision test_namespace test_trash_purge_schedule test_trash_purge_schedule_recovery +test_trash_purge_schedule_staggering test_mirror_snapshot_schedule test_mirror_snapshot_schedule_recovery +test_mirror_snapshot_schedule_staggering test_perf_image_iostat test_perf_image_iostat_recovery test_mirror_pool_peer_bootstrap_create diff --git a/src/pybind/mgr/rbd_support/mirror_snapshot_schedule.py b/src/pybind/mgr/rbd_support/mirror_snapshot_schedule.py index 02e2b7882eb..1b91a162546 100644 --- a/src/pybind/mgr/rbd_support/mirror_snapshot_schedule.py +++ b/src/pybind/mgr/rbd_support/mirror_snapshot_schedule.py @@ -1,6 +1,7 @@ import errno import json import rados +import random import rbd import traceback @@ -519,7 +520,7 @@ class MirrorSnapshotScheduleHandler: pool_id, namespace, image_id)) return - schedule_time = schedule.next_run(now) + schedule_time = schedule.next_run(now, image_id) if schedule_time not in self.queue: self.queue[schedule_time] = [] self.log.debug( @@ -540,7 +541,8 @@ class MirrorSnapshotScheduleHandler: return None, (schedule_time - now).total_seconds() images = self.queue[schedule_time] - image = images.pop(0) + rng = random.Random(schedule_time.timestamp()) + image = images.pop(rng.randrange(len(images))) if not images: del self.queue[schedule_time] return image, 0.0 diff --git a/src/pybind/mgr/rbd_support/schedule.py b/src/pybind/mgr/rbd_support/schedule.py index 173ef7e6d5e..8866364516c 100644 --- a/src/pybind/mgr/rbd_support/schedule.py +++ b/src/pybind/mgr/rbd_support/schedule.py @@ -1,3 +1,4 @@ +import hashlib import json import rados import rbd @@ -329,12 +330,26 @@ class Schedule: start_time: Optional[StartTime] = None) -> None: self.items.discard((interval, start_time)) - def next_run(self, now: datetime) -> datetime: + @staticmethod + def _compute_phase_offset_minutes(entity_id: str, period_minutes: int) -> int: + key = entity_id + "|" + str(period_minutes) + h = hashlib.md5(key.encode("utf-8")).hexdigest() + val = int(h, 16) + return (val % period_minutes) + + def next_run(self, now: datetime, entity_id: str) -> datetime: schedule_time = None for interval, start_time in self.items: period = timedelta(minutes=interval.minutes) - anchor_time = start_time.dt if start_time else datetime(1970, 1, 1, tzinfo=timezone.utc) + if start_time: + anchor_time = start_time.dt + else: + phase_offset_minutes = self._compute_phase_offset_minutes(entity_id, interval.minutes) + anchor_time = ( + datetime(1970, 1, 1, tzinfo=timezone.utc) + + timedelta(minutes=phase_offset_minutes) + ) if anchor_time > now: candidate_time = anchor_time diff --git a/src/pybind/mgr/rbd_support/trash_purge_schedule.py b/src/pybind/mgr/rbd_support/trash_purge_schedule.py index b9774d18e3d..9284a41480e 100644 --- a/src/pybind/mgr/rbd_support/trash_purge_schedule.py +++ b/src/pybind/mgr/rbd_support/trash_purge_schedule.py @@ -1,5 +1,6 @@ import json import rados +import random import rbd import traceback @@ -177,7 +178,8 @@ class TrashPurgeScheduleHandler: pool_id, namespace)) return - schedule_time = schedule.next_run(now) + schedule_time = schedule.next_run(now, + "{}/{}".format(pool_id, namespace)) if schedule_time not in self.queue: self.queue[schedule_time] = [] self.log.debug( @@ -198,7 +200,8 @@ class TrashPurgeScheduleHandler: return None, (schedule_time - now).total_seconds() namespaces = self.queue[schedule_time] - namespace = namespaces.pop(0) + rng = random.Random(schedule_time.timestamp()) + namespace = namespaces.pop(rng.randrange(len(namespaces))) if not namespaces: del self.queue[schedule_time] return namespace, 0.0