mgr/rbd_support: Stagger mirror snapshot and trash purge schedules

author Ramana Raja <rraja@redhat.com>

Mon, 29 Dec 2025 22:17:28 +0000 (17:17 -0500)

committer Ramana Raja <rraja@redhat.com>

Wed, 25 Feb 2026 13:00:40 +0000 (08:00 -0500)
author Ramana Raja <rraja@redhat.com>
Mon, 29 Dec 2025 22:17:28 +0000 (17:17 -0500)
committer Ramana Raja <rraja@redhat.com>
Wed, 25 Feb 2026 13:00:40 +0000 (08:00 -0500)
diff --git a/PendingReleaseNotes b/PendingReleaseNotes

index e14ae6e48bd16afa457e532954594a838d625561..0f366f1d001032a6dd79d0bdcfe1b59745f356a7 100644 (file)
--- a/PendingReleaseNotes
+++ b/PendingReleaseNotes
@@ -50,6 +50,9 @@
    synchronized using full copy, as blockdiff is not efficient for small files. The threshold is
    controlled by the new configuration option cephfs_mirror_blockdiff_min_file_size (default: 16_M).
    For more information, see https://tracker.ceph.com/issues/73452
+* RBD: Mirror snapshot creation and trash purge schedules are now automatically
+  staggered when no explicit "start-time" is specified. This reduces
+  synchronized scheduling spikes and distributes work more evenly over time.
  
  >=20.0.0
  
diff --git a/doc/rbd/rbd-mirroring.rst b/doc/rbd/rbd-mirroring.rst

index 7602191ca81894bc5920b25b142494b13b11c2e8..aaa28ac63d83e90d5fa80d863501a6d9125c9cab 100644 (file)
--- a/doc/rbd/rbd-mirroring.rst
+++ b/doc/rbd/rbd-mirroring.rst
@@ -421,6 +421,10 @@ globally, per-pool, or per-image levels. Multiple mirror-snapshot schedules can
  be defined at any level, but only the most-specific snapshot schedules that
  match an individual mirrored image will run.
  
+When multiple images share the same schedule interval and no explicit
+``start-time`` is defined, snapshot execution is automatically staggered across
+the interval to reduce scheduling spikes.
+
  To create a mirror-snapshot schedule with ``rbd``, specify the
  ``mirror snapshot schedule add`` command along with an optional pool or
  image name; interval; and optional start time::
diff --git a/qa/workunits/rbd/cli_generic.sh b/qa/workunits/rbd/cli_generic.sh

index 8cef48b238cd8e90647e063feb75b5082e13cb96..d59fa4e9b62b49e52e3e7e97e3e6b41dc82f3159 100755 (executable)
--- a/qa/workunits/rbd/cli_generic.sh
+++ b/qa/workunits/rbd/cli_generic.sh
@@ -1336,6 +1336,163 @@ test_trash_purge_schedule_recovery() {
  
  }
  
+test_trash_purge_schedule_staggering() {
+    echo "Testing trash purge schedule staggering..."
+    remove_images
+    ceph osd pool create rbd2 8
+    rbd pool init rbd2
+
+    # Initial empty check
+    test "$(ceph rbd trash purge schedule list)" = "{}"
+    ceph rbd trash purge schedule status | fgrep '"scheduled": []'
+
+    # Create 50 namespaces
+    for i in {1..50}; do
+        rbd namespace create "rbd2/test$i"
+    done
+
+    # Helper to get status JSON and verify all namespaces are scheduled
+    get_trash_purge_schedule_status() {
+        local num_scheduled=$1
+        local -n status_ref=$2
+
+        local list_json
+        list_json=$(rbd trash purge schedule ls -p rbd2 -R --format json)
+
+        local list_namespaces=()
+        mapfile -t list_namespaces < <(
+            jq -r 'sort_by(.namespace) | .[].namespace' <<< "$list_json"
+        )
+        [ "${#list_namespaces[@]}" -eq "$num_scheduled" ] || return 1
+
+        # Poll status until it has all scheduled namespaces
+        for ((j=0; j<12; j++)); do
+            status_ref=$(rbd trash purge schedule status -p rbd2 --format json)
+            [ "$(jq 'length' <<< "$status_ref")" -eq "${#list_namespaces[@]}" ] && break
+            sleep 10
+        done
+
+        local status_namespaces=()
+        mapfile -t status_namespaces < <(
+            jq -r 'sort_by(.namespace) | .[].namespace' <<< "$status_ref"
+        )
+        for i in "${!list_namespaces[@]}"; do
+            [[ "${list_namespaces[i]}" != "${status_namespaces[i]}" ]] && return 1;
+        done
+        return 0
+    }
+
+    # Helper to check staggering of schedules
+    are_trash_purge_schedules_staggered() {
+        local status_json=$1
+        local interval_min=$2
+        local unique_times=()
+        mapfile -t unique_times < <(jq -r '.[].schedule_time' <<< "$status_json" | sort -u)
+
+        # Expect one unique time slot per interval minute (1-minute scheduler granularity)
+        [ "${#unique_times[@]}" -eq "$interval_min" ] || return 1
+
+        # Check that consecutive schedule times are exactly 1 minute apart
+        local prev_epoch=$(( $(date -d "${unique_times[0]}" +%s)/60 ))
+        for ((i=1;i<${#unique_times[@]};i++)); do
+            local curr=$(( $(date -d "${unique_times[i]}" +%s)/60 ))
+            [ $((curr - prev_epoch)) -eq 1 ] || return 1
+            prev_epoch=$curr
+        done
+        return 0
+    }
+
+    # Verify that `schedule add/rm` maintains proper staggering
+    local interval_min=5
+    local status_json
+    local num_scheduled_namespaces=40
+    # Schedule namespaces test1..test40
+    for ((i=1; i<=40; i++)); do
+        rbd trash purge schedule add -p "rbd2/test$i" "${interval_min}m"
+    done
+    get_trash_purge_schedule_status "$num_scheduled_namespaces" status_json
+    are_trash_purge_schedules_staggered "$status_json" "$interval_min"
+
+    # Shift scheduling range to test6..test45
+    for ((i=41; i<=45; i++)); do
+        rbd trash purge schedule add -p "rbd2/test$i" "${interval_min}m"
+    done
+    for ((i=1; i<=5; i++)); do
+        rbd trash purge schedule rm -p "rbd2/test$i"
+    done
+    get_trash_purge_schedule_status "$num_scheduled_namespaces" status_json
+    are_trash_purge_schedules_staggered "$status_json" "$interval_min"
+
+    # Shift scheduling range to test11..test50
+    for ((i=46; i<=50; i++)); do
+        rbd trash purge schedule add -p "rbd2/test$i" "${interval_min}m"
+    done
+    for ((i=6; i<=10; i++)); do
+        rbd trash purge schedule rm -p "rbd2/test$i"
+    done
+    get_trash_purge_schedule_status "$num_scheduled_namespaces" status_json
+    are_trash_purge_schedules_staggered "$status_json" "$interval_min"
+
+    # Add schedules for test1..test10 with explicit start time.
+    # These should all share the same next schedule_time.
+    num_scheduled_namespaces=50
+    for ((i=1; i<=10; i++)); do
+        rbd trash purge schedule add -p "rbd2/test$i" "${interval_min}m" 2020-01-01
+    done
+
+    # Get updated status
+    get_trash_purge_schedule_status "$num_scheduled_namespaces" status_json
+
+    # Split status into two sets:
+    #   test1..test10  (explicit start-time)
+    #   test11..test50 (should remain staggered)
+    local anchored_times=()
+    local staggered_json
+
+    # Extract schedule times for namespaces test1..test10
+    mapfile -t anchored_times < <(
+        jq -r '.[]
+            | select(.namespace | test("^test([1-9]|10)$"))
+            | .schedule_time' <<< "$status_json" | sort -u
+    )
+
+    # All anchored schedules should share exactly one schedule_time
+    [ "${#anchored_times[@]}" -eq 1 ] || return 1
+
+    # Extract JSON only for namespaces test11..test50
+    staggered_json=$(jq '
+      map(select(.namespace | test("^test(1[1-9]|[2-4][0-9]|50)$")))
+    ' <<< "$status_json")
+
+    # Verify these remain properly staggered
+    are_trash_purge_schedules_staggered "$staggered_json" "$interval_min"
+
+    # Cleanup: remove all schedules
+    for ((i=1; i<=50; i++)); do
+        rbd trash purge schedule rm -p "rbd2/test$i"
+    done
+
+    # Wait until schedule status becomes empty
+    local empty_status
+    for ((j=0; j<12; j++)); do
+        empty_status=$(rbd trash purge schedule status -p rbd2 --format json)
+        [ "$(jq 'length' <<< "$empty_status")" -eq 0 ] && break
+        sleep 5
+    done
+
+    [ "$(jq 'length' <<< "$empty_status")" -eq 0 ] || {
+        echo "Error: trash purge schedule status not empty after removals"
+        return 1
+    }
+
+    # Remove namespaces
+    for ((i=1; i<=50; i++)); do
+        rbd namespace rm "rbd2/test$i"
+    done
+
+    ceph osd pool rm rbd2 rbd2 --yes-i-really-really-mean-it
+}
+
  test_mirror_snapshot_schedule() {
      echo "testing mirror snapshot schedule..."
      remove_images
@@ -1522,6 +1679,166 @@ test_mirror_snapshot_schedule_recovery() {
      ceph osd pool rm rbd3 rbd3 --yes-i-really-really-mean-it
  }
  
+test_mirror_snapshot_schedule_staggering() {
+    echo "Testing mirror snapshot schedule staggering..."
+
+    remove_images
+    ceph osd pool create rbd2 8
+    rbd pool init rbd2
+    rbd mirror pool enable rbd2 image
+    rbd mirror pool peer add rbd2 cluster1
+
+    # Initial empty check
+    test "$(ceph rbd mirror snapshot schedule list)" = "{}"
+    ceph rbd mirror snapshot schedule status | fgrep '"scheduled_images": []'
+
+    # Create 50 images
+    for i in {1..50}; do
+        rbd create $RBD_CREATE_ARGS -s 1 "rbd2/test$i"
+        rbd mirror image enable "rbd2/test$i" snapshot
+    done
+
+    # Helper to get status JSON and verify all images are scheduled
+    get_mirror_snapshot_schedule_status() {
+        local num_scheduled=$1
+        local -n status_ref=$2
+
+        local list_json
+        list_json=$(rbd mirror snapshot schedule ls -p rbd2 -R --format json)
+
+        local list_images=()
+        mapfile -t list_images < <(
+            jq -r 'sort_by(.image) | .[].image' <<< "$list_json"
+        )
+        [ "${#list_images[@]}" -eq "$num_scheduled" ] || return 1
+
+        # Poll status until it has all scheduled images
+        for ((j=0;j<12;j++)); do
+            status_ref=$(rbd mirror snapshot schedule status -p rbd2 --format json)
+            [ "$(jq 'length' <<< "$status_ref")" -eq "${#list_images[@]}" ] && break
+            sleep 10
+        done
+
+        local status_images=()
+        mapfile -t status_images < <(
+            jq -r 'sort_by(.image) | .[].image | split("/")[-1]' <<< "$status_ref"
+        )
+        for i in "${!list_images[@]}"; do
+            [[ "${list_images[i]}" != "${status_images[i]}" ]] && return 1;
+        done
+        return 0
+    }
+
+    # Helper to check staggering of schedules
+    are_mirror_snapshot_schedules_staggered() {
+        local status_json=$1 interval_min=$2
+        local unique_times=()
+        mapfile -t unique_times < <(
+            jq -r '.[].schedule_time' <<< "$status_json" | sort -u
+        )
+        # Expect one unique time slot per interval minute (1-minute scheduler granularity)
+        [ "${#unique_times[@]}" -eq "$interval_min" ] || return 1
+
+        # Check that consecutive schedule times are exactly 1 minute apart
+        local prev_epoch=$(( $(date -d "${unique_times[0]}" +%s)/60 ))
+        for ((i=1;i<${#unique_times[@]};i++)); do
+            local curr=$(( $(date -d "${unique_times[i]}" +%s)/60 ))
+            [ $((curr - prev_epoch)) -eq 1 ] || return 1
+            prev_epoch=$curr
+        done
+        return 0
+    }
+
+    # Verify that `schedule add/rm` maintains proper staggering
+    local interval_min=5
+    local status_json
+    local num_scheduled_images=40
+    # Schedule images test1..test40
+    for ((i=1; i<=40; i++)); do
+        rbd mirror snapshot schedule add -p rbd2 --image "test$i" "${interval_min}m"
+    done
+    get_mirror_snapshot_schedule_status "$num_scheduled_images" status_json
+    are_mirror_snapshot_schedules_staggered "$status_json" "$interval_min"
+
+    # Shift scheduling range to test6..test45
+    for ((i=41; i<=45; i++)); do
+        rbd mirror snapshot schedule add -p rbd2 --image "test$i" "${interval_min}m"
+    done
+    for ((i=1; i<=5; i++)); do
+        rbd mirror snapshot schedule rm -p rbd2 --image "test$i"
+    done
+    get_mirror_snapshot_schedule_status "$num_scheduled_images" status_json
+    are_mirror_snapshot_schedules_staggered "$status_json" "$interval_min"
+
+    # Shift scheduling range to test11..test50
+    for ((i=46; i<=50; i++)); do
+        rbd mirror snapshot schedule add -p rbd2 --image "test$i" "${interval_min}m"
+    done
+    for ((i=6; i<=10; i++)); do
+        rbd mirror snapshot schedule rm -p rbd2 --image "test$i"
+    done
+    get_mirror_snapshot_schedule_status "$num_scheduled_images" status_json
+    are_mirror_snapshot_schedules_staggered "$status_json" "$interval_min"
+
+    # Add schedules for test1..test10 with explicit start time.
+    # These should all share the same next schedule_time.
+    num_scheduled_images=50
+    for ((i=1; i<=10; i++)); do
+        rbd mirror snapshot schedule add -p rbd2 --image "test$i" "${interval_min}m" 2020-01-01
+    done
+
+    # Get updated status
+    get_mirror_snapshot_schedule_status "$num_scheduled_images" status_json
+
+    # Split status into two sets:
+    #   test1..test10  (explicit start-time)
+    #   test11..test50 (should remain staggered)
+    local anchored_times=()
+    local staggered_json
+
+    # Extract schedule times for images test1..test10
+    mapfile -t anchored_times < <(
+        jq -r '.[]
+            | select(.image | test("^rbd2/test([1-9]|10)$"))
+            | .schedule_time' <<< "$status_json" | sort -u
+    )
+
+    # All anchored schedules should share exactly one schedule_time
+    [ "${#anchored_times[@]}" -eq 1 ] || return 1
+
+    # Extract JSON only for images test11..test50
+    staggered_json=$(jq '
+      map(select(.image | test("^rbd2/test(1[1-9]|[2-4][0-9]|50)$")))
+    ' <<< "$status_json")
+
+    # Verify these remain properly staggered
+    are_mirror_snapshot_schedules_staggered "$staggered_json" "$interval_min"
+
+    # Cleanup: remove all schedules
+    for ((i=1; i<=50; i++)); do
+        rbd mirror snapshot schedule rm -p rbd2 --image "test$i"
+    done
+
+    # Wait until schedule status becomes empty
+    local empty_status
+    for ((j=0; j<12; j++)); do
+        empty_status=$(rbd mirror snapshot schedule status -p rbd2 --format json)
+        [ "$(jq 'length' <<< "$empty_status")" -eq 0 ] && break
+        sleep 5
+    done
+    [ "$(jq 'length' <<< "$empty_status")" -eq 0 ] || {
+        echo "Error: snapshot schedule status not empty after removals"
+        return 1
+    }
+
+    # Remove images
+    for ((i=1; i<=50; i++)); do
+        rbd rm "rbd2/test$i"
+    done
+
+    ceph osd pool rm rbd2 rbd2 --yes-i-really-really-mean-it
+}
+
  test_perf_image_iostat() {
      echo "testing perf image iostat..."
      remove_images
@@ -1780,8 +2097,10 @@ test_thick_provision
  test_namespace
  test_trash_purge_schedule
  test_trash_purge_schedule_recovery
+test_trash_purge_schedule_staggering
  test_mirror_snapshot_schedule
  test_mirror_snapshot_schedule_recovery
+test_mirror_snapshot_schedule_staggering
  test_perf_image_iostat
  test_perf_image_iostat_recovery
  test_mirror_pool_peer_bootstrap_create
diff --git a/src/pybind/mgr/rbd_support/mirror_snapshot_schedule.py b/src/pybind/mgr/rbd_support/mirror_snapshot_schedule.py

index 02e2b7882eb415b5575e2694d444dea03b1b61ba..1b91a16254619b54ed488eae8893f9896505f10d 100644 (file)
--- a/src/pybind/mgr/rbd_support/mirror_snapshot_schedule.py
+++ b/src/pybind/mgr/rbd_support/mirror_snapshot_schedule.py
@@ -1,6 +1,7 @@
  import errno
  import json
  import rados
+import random
  import rbd
  import traceback
  
@@ -519,7 +520,7 @@ class MirrorSnapshotScheduleHandler:
                      pool_id, namespace, image_id))
              return
  
-        schedule_time = schedule.next_run(now)
+        schedule_time = schedule.next_run(now, image_id)
          if schedule_time not in self.queue:
              self.queue[schedule_time] = []
          self.log.debug(
@@ -540,7 +541,8 @@ class MirrorSnapshotScheduleHandler:
              return None, (schedule_time - now).total_seconds()
  
          images = self.queue[schedule_time]
-        image = images.pop(0)
+        rng = random.Random(schedule_time.timestamp())
+        image = images.pop(rng.randrange(len(images)))
          if not images:
              del self.queue[schedule_time]
          return image, 0.0
diff --git a/src/pybind/mgr/rbd_support/schedule.py b/src/pybind/mgr/rbd_support/schedule.py

index 173ef7e6d5eb8ee0942a0eb59069ffbe285f6e64..8866364516c133a2c2e5070b53cd81e3adcf9da0 100644 (file)
--- a/src/pybind/mgr/rbd_support/schedule.py
+++ b/src/pybind/mgr/rbd_support/schedule.py
@@ -1,3 +1,4 @@
+import hashlib
  import json
  import rados
  import rbd
@@ -329,12 +330,26 @@ class Schedule:
                 start_time: Optional[StartTime] = None) -> None:
          self.items.discard((interval, start_time))
  
-    def next_run(self, now: datetime) -> datetime:
+    @staticmethod
+    def _compute_phase_offset_minutes(entity_id: str, period_minutes: int) -> int:
+        key = entity_id + "|" + str(period_minutes)
+        h = hashlib.md5(key.encode("utf-8")).hexdigest()
+        val = int(h, 16)
+        return (val % period_minutes)
+
+    def next_run(self, now: datetime, entity_id: str) -> datetime:
          schedule_time = None
  
          for interval, start_time in self.items:
              period = timedelta(minutes=interval.minutes)
-            anchor_time = start_time.dt if start_time else datetime(1970, 1, 1, tzinfo=timezone.utc)
+            if start_time:
+                anchor_time = start_time.dt
+            else:
+                phase_offset_minutes = self._compute_phase_offset_minutes(entity_id, interval.minutes)
+                anchor_time = (
+                    datetime(1970, 1, 1, tzinfo=timezone.utc)
+                    + timedelta(minutes=phase_offset_minutes)
+                )
  
              if anchor_time > now:
                  candidate_time = anchor_time
diff --git a/src/pybind/mgr/rbd_support/trash_purge_schedule.py b/src/pybind/mgr/rbd_support/trash_purge_schedule.py

index b9774d18e3d0ddd8c4a92c58acf0f8a84974decd..9284a41480e6fb619f42f35068ec4bf05c20a035 100644 (file)
--- a/src/pybind/mgr/rbd_support/trash_purge_schedule.py
+++ b/src/pybind/mgr/rbd_support/trash_purge_schedule.py
@@ -1,5 +1,6 @@
  import json
  import rados
+import random
  import rbd
  import traceback
  
@@ -177,7 +178,8 @@ class TrashPurgeScheduleHandler:
                      pool_id, namespace))
              return
  
-        schedule_time = schedule.next_run(now)
+        schedule_time = schedule.next_run(now,
+                                          "{}/{}".format(pool_id, namespace))
          if schedule_time not in self.queue:
              self.queue[schedule_time] = []
          self.log.debug(
@@ -198,7 +200,8 @@ class TrashPurgeScheduleHandler:
              return None, (schedule_time - now).total_seconds()
  
          namespaces = self.queue[schedule_time]
-        namespace = namespaces.pop(0)
+        rng = random.Random(schedule_time.timestamp())
+        namespace = namespaces.pop(rng.randrange(len(namespaces)))
          if not namespaces:
              del self.queue[schedule_time]
          return namespace, 0.0
author	Ramana Raja <rraja@redhat.com>
	Mon, 29 Dec 2025 22:17:28 +0000 (17:17 -0500)
committer	Ramana Raja <rraja@redhat.com>
	Wed, 25 Feb 2026 13:00:40 +0000 (08:00 -0500)
PendingReleaseNotes		patch \| blob \| history
doc/rbd/rbd-mirroring.rst		patch \| blob \| history
qa/workunits/rbd/cli_generic.sh		patch \| blob \| history
src/pybind/mgr/rbd_support/mirror_snapshot_schedule.py		patch \| blob \| history
src/pybind/mgr/rbd_support/schedule.py		patch \| blob \| history
src/pybind/mgr/rbd_support/trash_purge_schedule.py		patch \| blob \| history