From: Ramana Raja Date: Mon, 29 Dec 2025 22:17:28 +0000 (-0500) Subject: mgr/rbd_support: Stagger mirror snapshot and trash purge schedules X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=refs%2Fpull%2F66758%2Fhead;p=ceph.git mgr/rbd_support: Stagger mirror snapshot and trash purge schedules Previously, multiple images or namespaces scheduled with the same interval ran mirror snapshots or trash purges at around the same time, creating spikes in cluster activity. This change staggers scheduled jobs by: - Adding a deterministic phase offset per image or namespace when no start-time is set. - Picking a random element from the queue at each scheduled time, rather than always the first. Together, these changes spread snapshot and trash purge operations more evenly over time and improve cluster stability. Fixes: https://tracker.ceph.com/issues/74288 Signed-off-by: Ramana Raja --- diff --git a/PendingReleaseNotes b/PendingReleaseNotes index 6a6357285f0..780cb07daaf 100644 --- a/PendingReleaseNotes +++ b/PendingReleaseNotes @@ -54,6 +54,9 @@ ``ceph fs snapshot mirror daemon status`` now shows the remote cluster's monitor addresses and cluster ID for each configured peer, making it easier to verify peer connectivity and troubleshoot mirroring issues. +* RBD: Mirror snapshot creation and trash purge schedules are now automatically + staggered when no explicit "start-time" is specified. This reduces scheduling + spikes and distributes work more evenly over time. >=20.0.0 diff --git a/doc/rbd/rbd-mirroring.rst b/doc/rbd/rbd-mirroring.rst index 7602191ca81..5357ce21554 100644 --- a/doc/rbd/rbd-mirroring.rst +++ b/doc/rbd/rbd-mirroring.rst @@ -421,6 +421,10 @@ globally, per-pool, or per-image levels. Multiple mirror-snapshot schedules can be defined at any level, but only the most-specific snapshot schedules that match an individual mirrored image will run. 
+When multiple images share the same schedule interval and no explicit +``start-time`` is defined, snapshot creation is automatically staggered across +the interval to reduce scheduling spikes. + To create a mirror-snapshot schedule with ``rbd``, specify the ``mirror snapshot schedule add`` command along with an optional pool or image name; interval; and optional start time:: diff --git a/qa/workunits/rbd/cli_generic.sh b/qa/workunits/rbd/cli_generic.sh index 8cef48b238c..36fb254eb37 100755 --- a/qa/workunits/rbd/cli_generic.sh +++ b/qa/workunits/rbd/cli_generic.sh @@ -25,6 +25,35 @@ remove_images() { done } +# Verifies that the provided schedule status JSON represents a properly +# staggered schedule for the given interval. +are_schedules_staggered() { + # $1: Status JSON output of a scheduler in the rbd_support mgr module + # (e.g. `rbd trash purge schedule status --format json`) + local status_json=$1 + # $2: Schedule interval in minutes + local interval_min=$2 + + local unique_times=() + mapfile -t unique_times < <(jq -r '.[].schedule_time' <<< "$status_json" | sort -u) + + # Expect one unique time slot per interval minute (1-minute scheduler granularity). + # Allow one extra time slot in case status is observed during cycle rollover + (( ${#unique_times[@]} == interval_min || + ${#unique_times[@]} == interval_min + 1 )) || return 1 + + # Check that consecutive schedule times are exactly 1 minute apart + local prev_epoch + prev_epoch=$(date -d "${unique_times[0]}" +%s) + for ((i=1; i<${#unique_times[@]}; i++)); do + local curr + curr=$(date -d "${unique_times[i]}" +%s) + [ $((curr - prev_epoch)) -eq 60 ] || return 1 + prev_epoch=$curr + done + return 0 +} + test_others() { echo "testing import, export, resize, and snapshots..." 
TMP_FILES="/tmp/img1 /tmp/img1.new /tmp/img2 /tmp/img2.new /tmp/img3 /tmp/img3.new /tmp/img-diff1.new /tmp/img-diff2.new /tmp/img-diff3.new /tmp/img1.snap1 /tmp/img1.snap1 /tmp/img-diff1.snap1" @@ -1336,6 +1365,146 @@ test_trash_purge_schedule_recovery() { } +test_trash_purge_schedule_staggering() { + echo "Testing trash purge schedule staggering..." + remove_images + ceph osd pool create rbd2 8 + rbd pool init rbd2 + + # Initial empty check + test "$(ceph rbd trash purge schedule list)" = "{}" + ceph rbd trash purge schedule status | fgrep '"scheduled": []' + + # Create 80 namespaces + for i in {1..80}; do + rbd namespace create "rbd2/test$i" + done + + # Helper to get status JSON and verify all namespaces are scheduled + get_trash_purge_schedule_status() { + local num_scheduled=$1 + local -n status_ref=$2 + + # Verify number of namespaces in list output + local list_json + list_json=$(rbd trash purge schedule ls -p rbd2 -R --format json) + [ "$(jq 'length' <<< "$list_json")" -eq "$num_scheduled" ] || return 1 + + # Poll status until it reflects the same number of scheduled namespaces + for ((j=0; j<12; j++)); do + status_ref=$(rbd trash purge schedule status -p rbd2 --format json) + [ "$(jq 'length' <<< "$status_ref")" -eq "$num_scheduled" ] && break + sleep 10 + done + [ "$(jq 'length' <<< "$status_ref")" -eq "$num_scheduled" ] || return 1 + + # Verify namespaces in list and status outputs match + local list_namespaces + list_namespaces=$(jq -r 'sort_by(.namespace) | .[].namespace' <<< "$list_json") + local status_namespaces + status_namespaces=$(jq -r 'sort_by(.namespace) | .[].namespace' <<< "$status_ref") + [ "$list_namespaces" = "$status_namespaces" ] || return 1 + return 0 + } + + # Verify that `schedule add/rm` maintains proper staggering + local interval_min=5 + local status_json + # Schedule namespaces test1..test60 + for ((i=1; i<=60; i++)); do + rbd trash purge schedule add -p "rbd2/test$i" "${interval_min}m" + done + get_trash_purge_schedule_status 60 
status_json + are_schedules_staggered "$status_json" "$interval_min" + + # Modify scheduling range to test1..test70 (add 10 namespaces) + for ((i=61; i<=70; i++)); do + rbd trash purge schedule add -p "rbd2/test$i" "${interval_min}m" + done + get_trash_purge_schedule_status 70 status_json + are_schedules_staggered "$status_json" "$interval_min" + + # Modify scheduling range to test1..test80 (add 10 more namespaces) + for ((i=71; i<=80; i++)); do + rbd trash purge schedule add -p "rbd2/test$i" "${interval_min}m" + done + get_trash_purge_schedule_status 80 status_json + are_schedules_staggered "$status_json" "$interval_min" + + # Split into: + # first half = test1..test40 + # second half = test41..test80 + local first_half_json + first_half_json=$(jq ' + map(select(.namespace | test("^test([1-9]|[1-3][0-9]|40)$"))) + ' <<< "$status_json") + [ "$(jq 'length' <<< "$first_half_json")" -eq 40 ] || return 1 + local second_half_json + second_half_json=$(jq ' + map(select(.namespace | test("^test(4[1-9]|[5-7][0-9]|80)$"))) + ' <<< "$status_json") + [ "$(jq 'length' <<< "$second_half_json")" -eq 40 ] || return 1 + # Both halves must be staggered + are_schedules_staggered "$first_half_json" "$interval_min" + are_schedules_staggered "$second_half_json" "$interval_min" + + # Modify scheduling range to test41..test80 (drop first half) + for ((i=1; i<=40; i++)); do + rbd trash purge schedule rm -p "rbd2/test$i" + done + get_trash_purge_schedule_status 40 status_json + are_schedules_staggered "$status_json" "$interval_min" + + # Re-add schedules for first half with explicit start time. + # These should all share the same next schedule_time. 
+ for ((i=1; i<=40; i++)); do + rbd trash purge schedule add -p "rbd2/test$i" "${interval_min}m" 2020-01-01 + done + # Get updated status + get_trash_purge_schedule_status 80 status_json + + # Verify first half share the same next schedule_time + first_half_json=$(jq ' + map(select(.namespace | test("^test([1-9]|[1-3][0-9]|40)$"))) + ' <<< "$status_json") + [ "$(jq 'length' <<< "$first_half_json")" -eq 40 ] || return 1 + local anchored_times=() + mapfile -t anchored_times < <( + jq -r '.[].schedule_time' <<< "$first_half_json" | sort -u + ) + (( ${#anchored_times[@]} == 1 )) || return 1 + + # Verify second half remains staggered + second_half_json=$(jq ' + map(select(.namespace | test("^test(4[1-9]|[5-7][0-9]|80)$"))) + ' <<< "$status_json") + [ "$(jq 'length' <<< "$second_half_json")" -eq 40 ] || return 1 + are_schedules_staggered "$second_half_json" "$interval_min" + + # Cleanup: remove all schedules + for ((i=1; i<=80; i++)); do + rbd trash purge schedule rm -p "rbd2/test$i" + done + + # Wait until schedule status becomes empty + for ((j=0; j<12; j++)); do + status_json=$(rbd trash purge schedule status -p rbd2 --format json) + [ "$(jq 'length' <<< "$status_json")" -eq 0 ] && break + sleep 10 + done + [ "$(jq 'length' <<< "$status_json")" -eq 0 ] || { + echo "Error: trash purge schedule status not empty after removals" + return 1 + } + + # Remove namespaces + for ((i=1; i<=80; i++)); do + rbd namespace rm "rbd2/test$i" + done + + ceph osd pool rm rbd2 rbd2 --yes-i-really-really-mean-it +} + test_mirror_snapshot_schedule() { echo "testing mirror snapshot schedule..." remove_images @@ -1522,6 +1691,153 @@ test_mirror_snapshot_schedule_recovery() { ceph osd pool rm rbd3 rbd3 --yes-i-really-really-mean-it } +test_mirror_snapshot_schedule_staggering() { + echo "Testing mirror snapshot schedule staggering..." 
+ + remove_images + ceph osd pool create rbd2 8 + rbd pool init rbd2 + rbd mirror pool enable rbd2 image + rbd mirror pool peer add rbd2 cluster1 + + # Initial empty check + test "$(ceph rbd mirror snapshot schedule list)" = "{}" + ceph rbd mirror snapshot schedule status | fgrep '"scheduled_images": []' + + # Create 80 images + for i in {1..80}; do + rbd create $RBD_CREATE_ARGS -s 1 "rbd2/test$i" + rbd mirror image enable "rbd2/test$i" snapshot + done + + # Helper to get status JSON and verify all images are scheduled + get_mirror_snapshot_schedule_status() { + local num_scheduled=$1 + local -n status_ref=$2 + + # Verify number of images in list output + local list_json + list_json=$(rbd mirror snapshot schedule ls -p rbd2 -R --format json) + [ "$(jq 'length' <<< "$list_json")" -eq "$num_scheduled" ] || return 1 + + # Poll status until it reflects the same number of scheduled images + for ((j=0; j<12; j++)); do + status_ref=$(rbd mirror snapshot schedule status -p rbd2 --format json) + [ "$(jq 'length' <<< "$status_ref")" -eq "$num_scheduled" ] && break + sleep 10 + done + [ "$(jq 'length' <<< "$status_ref")" -eq "$num_scheduled" ] || return 1 + + # Verify images in list and status outputs match + local list_images + list_images=$(jq -r 'sort_by(.image) | .[].image' <<< "$list_json") + # In status JSON, '.image' contains full image spec, not just the name + local status_images + status_images=$( + jq -r 'sort_by(.image) | .[].image | split("/")[-1]' <<< "$status_ref" + ) + [ "$list_images" = "$status_images" ] || return 1 + return 0 + } + + # Verify that `schedule add/rm` maintains proper staggering + local interval_min=5 + local status_json + # Schedule images test1..test60 + for ((i=1; i<=60; i++)); do + rbd mirror snapshot schedule add -p rbd2 --image "test$i" "${interval_min}m" + done + get_mirror_snapshot_schedule_status 60 status_json + are_schedules_staggered "$status_json" "$interval_min" + + # Modify scheduling range to test61..test70 (add 10 images) + 
for ((i=61; i<=70; i++)); do
+        rbd mirror snapshot schedule add -p rbd2 --image "test$i" "${interval_min}m"
+    done
+    get_mirror_snapshot_schedule_status 70 status_json
+    are_schedules_staggered "$status_json" "$interval_min"
+
+    # Modify scheduling range to test71..test80 (add 10 more images)
+    for ((i=71; i<=80; i++)); do
+        rbd mirror snapshot schedule add -p rbd2 --image "test$i" "${interval_min}m"
+    done
+    get_mirror_snapshot_schedule_status 80 status_json
+    are_schedules_staggered "$status_json" "$interval_min"
+
+    # Split into:
+    #   first half  = test1..test40
+    #   second half = test41..test80
+    local first_half_json
+    first_half_json=$(jq '
+        map(select(.image | test("^rbd2/test([1-9]|[1-3][0-9]|40)$")))
+    ' <<< "$status_json")
+    [ "$(jq 'length' <<< "$first_half_json")" -eq 40 ] || return 1
+    local second_half_json
+    second_half_json=$(jq '
+        map(select(.image | test("^rbd2/test(4[1-9]|[5-7][0-9]|80)$")))
+    ' <<< "$status_json")
+    [ "$(jq 'length' <<< "$second_half_json")" -eq 40 ] || return 1
+    # Both halves must be staggered
+    are_schedules_staggered "$first_half_json" "$interval_min"
+    are_schedules_staggered "$second_half_json" "$interval_min"
+
+    # Modify scheduling range to test41..test80 (drop first half)
+    for ((i=1; i<=40; i++)); do
+        rbd mirror snapshot schedule rm -p rbd2 --image "test$i"
+    done
+    get_mirror_snapshot_schedule_status 40 status_json
+    are_schedules_staggered "$status_json" "$interval_min"
+
+    # Re-add schedules for first half with explicit start time.
+    # These should all share the same next schedule_time.
+ for ((i=1; i<=40; i++)); do + rbd mirror snapshot schedule add -p rbd2 --image "test$i" "${interval_min}m" 2020-01-01 + done + # Get updated status + get_mirror_snapshot_schedule_status 80 status_json + + # Verify first half share the same next schedule_time + first_half_json=$(jq ' + map(select(.image | test("^rbd2/test([1-9]|[1-3][0-9]|40)$"))) + ' <<< "$status_json") + [ "$(jq 'length' <<< "$first_half_json")" -eq 40 ] || return 1 + local anchored_times=() + mapfile -t anchored_times < <( + jq -r '.[].schedule_time' <<< "$first_half_json" | sort -u + ) + (( ${#anchored_times[@]} == 1 )) || return 1 + + # Verify second half remains staggered + second_half_json=$(jq ' + map(select(.image | test("^rbd2/test(4[1-9]|[5-7][0-9]|80)$"))) + ' <<< "$status_json") + [ "$(jq 'length' <<< "$second_half_json")" -eq 40 ] || return 1 + are_schedules_staggered "$second_half_json" "$interval_min" + + # Cleanup: remove all schedules + for ((i=1; i<=80; i++)); do + rbd mirror snapshot schedule rm -p rbd2 --image "test$i" + done + + # Wait until schedule status becomes empty + for ((j=0; j<12; j++)); do + status_json=$(rbd mirror snapshot schedule status -p rbd2 --format json) + [ "$(jq 'length' <<< "$status_json")" -eq 0 ] && break + sleep 10 + done + [ "$(jq 'length' <<< "$status_json")" -eq 0 ] || { + echo "Error: mirror snapshot schedule status not empty after removals" + return 1 + } + + # Remove images + for ((i=1; i<=80; i++)); do + rbd rm "rbd2/test$i" + done + + ceph osd pool rm rbd2 rbd2 --yes-i-really-really-mean-it +} + test_perf_image_iostat() { echo "testing perf image iostat..." 
remove_images @@ -1780,8 +2096,10 @@ test_thick_provision test_namespace test_trash_purge_schedule test_trash_purge_schedule_recovery +test_trash_purge_schedule_staggering test_mirror_snapshot_schedule test_mirror_snapshot_schedule_recovery +test_mirror_snapshot_schedule_staggering test_perf_image_iostat test_perf_image_iostat_recovery test_mirror_pool_peer_bootstrap_create diff --git a/src/pybind/mgr/rbd_support/mirror_snapshot_schedule.py b/src/pybind/mgr/rbd_support/mirror_snapshot_schedule.py index 02e2b7882eb..d2133a4a143 100644 --- a/src/pybind/mgr/rbd_support/mirror_snapshot_schedule.py +++ b/src/pybind/mgr/rbd_support/mirror_snapshot_schedule.py @@ -1,6 +1,7 @@ import errno import json import rados +import random import rbd import traceback @@ -519,7 +520,8 @@ class MirrorSnapshotScheduleHandler: pool_id, namespace, image_id)) return - schedule_time = schedule.next_run(now) + schedule_time = schedule.next_run( + now, "{}/{}/{}".format(pool_id, namespace, image_id)) if schedule_time not in self.queue: self.queue[schedule_time] = [] self.log.debug( @@ -540,7 +542,8 @@ class MirrorSnapshotScheduleHandler: return None, (schedule_time - now).total_seconds() images = self.queue[schedule_time] - image = images.pop(0) + rng = random.Random(schedule_time.timestamp()) + image = images.pop(rng.randrange(len(images))) if not images: del self.queue[schedule_time] return image, 0.0 diff --git a/src/pybind/mgr/rbd_support/schedule.py b/src/pybind/mgr/rbd_support/schedule.py index 173ef7e6d5e..fc4d40018f7 100644 --- a/src/pybind/mgr/rbd_support/schedule.py +++ b/src/pybind/mgr/rbd_support/schedule.py @@ -1,3 +1,4 @@ +import hashlib import json import rados import rbd @@ -329,12 +330,27 @@ class Schedule: start_time: Optional[StartTime] = None) -> None: self.items.discard((interval, start_time)) - def next_run(self, now: datetime) -> datetime: + @staticmethod + def _compute_phase_offset_minutes(entity_id: str, period_minutes: int) -> int: + key = entity_id + "|" + 
str(period_minutes) + h = hashlib.md5(key.encode("utf-8")).hexdigest() + val = int(h, 16) + return (val % period_minutes) + + def next_run(self, now: datetime, entity_id: str) -> datetime: schedule_time = None for interval, start_time in self.items: period = timedelta(minutes=interval.minutes) - anchor_time = start_time.dt if start_time else datetime(1970, 1, 1, tzinfo=timezone.utc) + if start_time: + anchor_time = start_time.dt + else: + phase_offset_minutes = self._compute_phase_offset_minutes( + entity_id, interval.minutes) + anchor_time = ( + datetime(1970, 1, 1, tzinfo=timezone.utc) + + timedelta(minutes=phase_offset_minutes) + ) if anchor_time > now: candidate_time = anchor_time diff --git a/src/pybind/mgr/rbd_support/trash_purge_schedule.py b/src/pybind/mgr/rbd_support/trash_purge_schedule.py index b9774d18e3d..9284a41480e 100644 --- a/src/pybind/mgr/rbd_support/trash_purge_schedule.py +++ b/src/pybind/mgr/rbd_support/trash_purge_schedule.py @@ -1,5 +1,6 @@ import json import rados +import random import rbd import traceback @@ -177,7 +178,8 @@ class TrashPurgeScheduleHandler: pool_id, namespace)) return - schedule_time = schedule.next_run(now) + schedule_time = schedule.next_run(now, + "{}/{}".format(pool_id, namespace)) if schedule_time not in self.queue: self.queue[schedule_time] = [] self.log.debug( @@ -198,7 +200,8 @@ class TrashPurgeScheduleHandler: return None, (schedule_time - now).total_seconds() namespaces = self.queue[schedule_time] - namespace = namespaces.pop(0) + rng = random.Random(schedule_time.timestamp()) + namespace = namespaces.pop(rng.randrange(len(namespaces))) if not namespaces: del self.queue[schedule_time] return namespace, 0.0