mgr/rbd_support: Stagger mirror snapshot and trash purge schedules

author Ramana Raja <rraja@redhat.com>

Mon, 29 Dec 2025 22:17:28 +0000 (17:17 -0500)

committer Ramana Raja <rraja@redhat.com>

Tue, 3 Mar 2026 18:12:45 +0000 (13:12 -0500)
author Ramana Raja <rraja@redhat.com>
Mon, 29 Dec 2025 22:17:28 +0000 (17:17 -0500)
committer Ramana Raja <rraja@redhat.com>
Tue, 3 Mar 2026 18:12:45 +0000 (13:12 -0500)
diff --git a/PendingReleaseNotes b/PendingReleaseNotes

index 6a6357285f0248c395913c38b41e880c52c396bc..780cb07daaf44015a5cd9992cb15a3802ca914bd 100644 (file)
--- a/PendingReleaseNotes
+++ b/PendingReleaseNotes
@@ -54,6 +54,9 @@
    ``ceph fs snapshot mirror daemon status`` now shows the remote cluster's
    monitor addresses and cluster ID for each configured peer, making it easier
    to verify peer connectivity and troubleshoot mirroring issues.
+* RBD: Mirror snapshot creation and trash purge schedules are now automatically
+  staggered when no explicit "start-time" is specified. This reduces scheduling
+  spikes and distributes work more evenly over time.
  
  >=20.0.0
  
diff --git a/doc/rbd/rbd-mirroring.rst b/doc/rbd/rbd-mirroring.rst

index 7602191ca81894bc5920b25b142494b13b11c2e8..5357ce215548dc3fa21895bf9f66f9f1773c8f20 100644 (file)
--- a/doc/rbd/rbd-mirroring.rst
+++ b/doc/rbd/rbd-mirroring.rst
@@ -421,6 +421,10 @@ globally, per-pool, or per-image levels. Multiple mirror-snapshot schedules can
  be defined at any level, but only the most-specific snapshot schedules that
  match an individual mirrored image will run.
  
+When multiple images share the same schedule interval and no explicit
+``start-time`` is defined, snapshot creation is automatically staggered across
+the interval to reduce scheduling spikes.
+
  To create a mirror-snapshot schedule with ``rbd``, specify the
  ``mirror snapshot schedule add`` command along with an optional pool or
  image name; interval; and optional start time::
diff --git a/qa/workunits/rbd/cli_generic.sh b/qa/workunits/rbd/cli_generic.sh

index 8cef48b238cd8e90647e063feb75b5082e13cb96..36fb254eb3790d14c0e433fccc07bdfdd17a56a3 100755 (executable)
--- a/qa/workunits/rbd/cli_generic.sh
+++ b/qa/workunits/rbd/cli_generic.sh
@@ -25,6 +25,35 @@ remove_images() {
      done
  }
  
+# Verifies that the provided schedule status JSON represents a properly
+# staggered schedule for the given interval.
+are_schedules_staggered() {
+    # $1: Status JSON output of a scheduler in the rbd_support mgr module
+    #     (e.g. `rbd trash purge schedule status --format json`)
+    local status_json=$1
+    # $2: Schedule interval in minutes
+    local interval_min=$2
+
+    local unique_times=()
+    mapfile -t unique_times < <(jq -r '.[].schedule_time' <<< "$status_json" | sort -u)
+
+    # Expect one unique time slot per interval minute (1-minute scheduler granularity).
+    # Allow one extra time slot in case status is observed during cycle rollover
+    (( ${#unique_times[@]} == interval_min ||
+       ${#unique_times[@]} == interval_min + 1 )) || return 1
+
+    # Check that consecutive schedule times are exactly 1 minute apart
+    local prev_epoch
+    prev_epoch=$(date -d "${unique_times[0]}" +%s)
+    for ((i=1; i<${#unique_times[@]}; i++)); do
+        local curr
+        curr=$(date -d "${unique_times[i]}" +%s)
+        [ $((curr - prev_epoch)) -eq 60 ] || return 1
+        prev_epoch=$curr
+    done
+    return 0
+}
+
  test_others() {
      echo "testing import, export, resize, and snapshots..."
      TMP_FILES="/tmp/img1 /tmp/img1.new /tmp/img2 /tmp/img2.new /tmp/img3 /tmp/img3.new /tmp/img-diff1.new /tmp/img-diff2.new /tmp/img-diff3.new /tmp/img1.snap1 /tmp/img1.snap1 /tmp/img-diff1.snap1"
@@ -1336,6 +1365,146 @@ test_trash_purge_schedule_recovery() {
  
  }
  
+test_trash_purge_schedule_staggering() {  # Checks that trash purge schedules added without a start time get staggered schedule times
+    echo "Testing trash purge schedule staggering..."
+    remove_images
+    ceph osd pool create rbd2 8
+    rbd pool init rbd2
+
+    # Initial empty check: no schedules are listed and nothing is scheduled yet
+    test "$(ceph rbd trash purge schedule list)" = "{}"
+    ceph rbd trash purge schedule status | fgrep '"scheduled": []'
+
+    # Create 80 namespaces (rbd2/test1..rbd2/test80) to attach schedules to
+    for i in {1..80}; do
+        rbd namespace create "rbd2/test$i"
+    done
+
+    # Helper: $1 = expected number of scheduled namespaces, $2 = name of the caller's variable that receives the status JSON; returns 1 on any mismatch
+    get_trash_purge_schedule_status() {
+        local num_scheduled=$1
+        local -n status_ref=$2
+
+        # Verify the recursive schedule list for rbd2 has exactly $num_scheduled entries
+        local list_json
+        list_json=$(rbd trash purge schedule ls -p rbd2 -R --format json)
+        [ "$(jq 'length' <<< "$list_json")" -eq "$num_scheduled" ] || return 1
+
+        # Poll status (up to 12 attempts, 10 seconds apart) until it reflects the same number of scheduled namespaces
+        for ((j=0; j<12; j++)); do
+            status_ref=$(rbd trash purge schedule status -p rbd2 --format json)
+            [ "$(jq 'length' <<< "$status_ref")" -eq "$num_scheduled" ] && break
+            sleep 10
+        done
+        [ "$(jq 'length' <<< "$status_ref")" -eq "$num_scheduled" ] || return 1
+
+        # Verify namespaces in list and status outputs match (compared as name lists sorted by namespace)
+        local list_namespaces
+        list_namespaces=$(jq -r 'sort_by(.namespace) | .[].namespace' <<< "$list_json")
+        local status_namespaces
+        status_namespaces=$(jq -r 'sort_by(.namespace) | .[].namespace' <<< "$status_ref")
+        [ "$list_namespaces" = "$status_namespaces" ] || return 1
+        return 0
+    }
+
+    # Verify that `schedule add/rm` maintains proper staggering (5-minute interval, no start time given)
+    local interval_min=5
+    local status_json
+    # Schedule namespaces test1..test60
+    for ((i=1; i<=60; i++)); do
+        rbd trash purge schedule add -p "rbd2/test$i" "${interval_min}m"
+    done
+    get_trash_purge_schedule_status 60 status_json
+    are_schedules_staggered "$status_json" "$interval_min"
+
+    # Modify scheduling range to test1..test70 (add 10 namespaces)
+    for ((i=61; i<=70; i++)); do
+        rbd trash purge schedule add -p "rbd2/test$i" "${interval_min}m"
+    done
+    get_trash_purge_schedule_status 70 status_json
+    are_schedules_staggered "$status_json" "$interval_min"
+
+    # Modify scheduling range to test1..test80 (add 10 more namespaces)
+    for ((i=71; i<=80; i++)); do
+        rbd trash purge schedule add -p "rbd2/test$i" "${interval_min}m"
+    done
+    get_trash_purge_schedule_status 80 status_json
+    are_schedules_staggered "$status_json" "$interval_min"
+
+    # Split the 80-entry status into two subsets selected by namespace name:
+    #   first half = test1..test40
+    #   second half = test41..test80
+    local first_half_json
+    first_half_json=$(jq '
+        map(select(.namespace | test("^test([1-9]|[1-3][0-9]|40)$")))
+    ' <<< "$status_json")
+    [ "$(jq 'length' <<< "$first_half_json")" -eq 40 ] || return 1
+    local second_half_json
+    second_half_json=$(jq '
+        map(select(.namespace | test("^test(4[1-9]|[5-7][0-9]|80)$")))
+    ' <<< "$status_json")
+    [ "$(jq 'length' <<< "$second_half_json")" -eq 40 ] || return 1
+    # Both halves must be staggered on their own, not only the full set
+    are_schedules_staggered "$first_half_json" "$interval_min"
+    are_schedules_staggered "$second_half_json" "$interval_min"
+
+    # Modify scheduling range to test41..test80 (drop first half)
+    for ((i=1; i<=40; i++)); do
+        rbd trash purge schedule rm -p "rbd2/test$i"
+    done
+    get_trash_purge_schedule_status 40 status_json
+    are_schedules_staggered "$status_json" "$interval_min"
+
+    # Re-add schedules for first half with explicit start time (2020-01-01).
+    # These should all share the same next schedule_time.
+    for ((i=1; i<=40; i++)); do
+        rbd trash purge schedule add -p "rbd2/test$i" "${interval_min}m" 2020-01-01
+    done
+    # Get updated status (all 80 namespaces are scheduled again)
+    get_trash_purge_schedule_status 80 status_json
+
+    # Verify first half share the same next schedule_time (exactly one distinct value)
+    first_half_json=$(jq '
+        map(select(.namespace | test("^test([1-9]|[1-3][0-9]|40)$")))
+    ' <<< "$status_json")
+    [ "$(jq 'length' <<< "$first_half_json")" -eq 40 ] || return 1
+    local anchored_times=()
+    mapfile -t anchored_times < <(
+        jq -r '.[].schedule_time' <<< "$first_half_json" | sort -u
+    )
+    (( ${#anchored_times[@]} == 1 )) || return 1
+
+    # Verify second half (still without a start time) remains staggered
+    second_half_json=$(jq '
+        map(select(.namespace | test("^test(4[1-9]|[5-7][0-9]|80)$")))
+    ' <<< "$status_json")
+    [ "$(jq 'length' <<< "$second_half_json")" -eq 40 ] || return 1
+    are_schedules_staggered "$second_half_json" "$interval_min"
+
+    # Cleanup: remove all schedules
+    for ((i=1; i<=80; i++)); do
+        rbd trash purge schedule rm -p "rbd2/test$i"
+    done
+
+    # Wait until schedule status becomes empty (up to 12 attempts, 10 seconds apart)
+    for ((j=0; j<12; j++)); do
+        status_json=$(rbd trash purge schedule status -p rbd2 --format json)
+        [ "$(jq 'length' <<< "$status_json")" -eq 0 ] && break
+        sleep 10
+    done
+    [ "$(jq 'length' <<< "$status_json")" -eq 0 ] || {
+        echo "Error: trash purge schedule status not empty after removals"
+        return 1
+    }
+
+    # Remove namespaces, then the pool created for this test
+    for ((i=1; i<=80; i++)); do
+        rbd namespace rm "rbd2/test$i"
+    done
+
+    ceph osd pool rm rbd2 rbd2 --yes-i-really-really-mean-it
+}
+
  test_mirror_snapshot_schedule() {
      echo "testing mirror snapshot schedule..."
      remove_images
@@ -1522,6 +1691,153 @@ test_mirror_snapshot_schedule_recovery() {
      ceph osd pool rm rbd3 rbd3 --yes-i-really-really-mean-it
  }
  
+test_mirror_snapshot_schedule_staggering() {
+    echo "Testing mirror snapshot schedule staggering..."
+
+    remove_images
+    ceph osd pool create rbd2 8
+    rbd pool init rbd2
+    rbd mirror pool enable rbd2 image
+    rbd mirror pool peer add rbd2 cluster1
+
+    # Initial empty check
+    test "$(ceph rbd mirror snapshot schedule list)" = "{}"
+    ceph rbd mirror snapshot schedule status | fgrep '"scheduled_images": []'
+
+    # Create 80 images
+    for i in {1..80}; do
+        rbd create $RBD_CREATE_ARGS -s 1 "rbd2/test$i"
+        rbd mirror image enable "rbd2/test$i" snapshot
+    done
+
+    # Helper to get status JSON and verify all images are scheduled
+    get_mirror_snapshot_schedule_status() {
+        local num_scheduled=$1
+        local -n status_ref=$2
+
+        # Verify number of images in list output
+        local list_json
+        list_json=$(rbd mirror snapshot schedule ls -p rbd2 -R --format json)
+        [ "$(jq 'length' <<< "$list_json")" -eq "$num_scheduled" ] || return 1
+
+        # Poll status until it reflects the same number of scheduled images
+        for ((j=0; j<12; j++)); do
+            status_ref=$(rbd mirror snapshot schedule status -p rbd2 --format json)
+            [ "$(jq 'length' <<< "$status_ref")" -eq "$num_scheduled" ] && break
+            sleep 10
+        done
+        [ "$(jq 'length' <<< "$status_ref")" -eq "$num_scheduled" ] || return 1
+
+        # Verify images in list and status outputs match
+        local list_images
+        list_images=$(jq -r 'sort_by(.image) | .[].image' <<< "$list_json")
+        # In status JSON, '.image' contains full image spec, not just the name
+        local status_images
+        status_images=$(
+            jq -r 'sort_by(.image) | .[].image | split("/")[-1]' <<< "$status_ref"
+        )
+        [ "$list_images" = "$status_images" ] || return 1
+        return 0
+    }
+
+    # Verify that `schedule add/rm` maintains proper staggering
+    local interval_min=5
+    local status_json
+    # Schedule images test1..test60
+    for ((i=1; i<=60; i++)); do
+        rbd mirror snapshot schedule add -p rbd2 --image "test$i" "${interval_min}m"
+    done
+    get_mirror_snapshot_schedule_status 60 status_json
+    are_schedules_staggered "$status_json" "$interval_min"
+
+    # Modify scheduling range to test61..test70 (add 10 images)
+    for ((i=61; i<=70; i++)); do
+        rbd mirror snapshot schedule add -p rbd2 --image "test$i" "${interval_min}m"
+    done
+    get_mirror_snapshot_schedule_status 70 status_json
+    are_schedules_staggered "$status_json" "$interval_min"
+
+    # Modify scheduling range to test71..test80 (add 10 more images)
+    for ((i=70; i<=80; i++)); do
+        rbd mirror snapshot schedule add -p rbd2 --image "test$i" "${interval_min}m"
+    done
+    get_mirror_snapshot_schedule_status 80 status_json
+    are_schedules_staggered "$status_json" "$interval_min"
+
+    # Split into:
+    #   first half = test1..test40
+    #   second half = test41..test80
+    local first_half_json
+    first_half_json=$(jq '
+        map(select(.image | test("^rbd2/test([1-9]|[1-3][0-9]|40)$")))
+    ' <<< "$status_json")
+    [ "$(jq 'length' <<< "$first_half_json")" -eq 40 ] || return 1
+    local second_half_json
+    second_half_json=$(jq '
+        map(select(.image | test("^rbd2/test(4[1-9]|[5-7][0-9]|80)$")))
+    ' <<< "$status_json")
+    [ "$(jq 'length' <<< "$second_half_json")" -eq 40 ] || return 1
+    # Both halves must be staggered
+    are_schedules_staggered "$first_half_json" "$interval_min"
+    are_schedules_staggered "$second_half_json" "$interval_min"
+
+    # Modify scheduling range to test41..test80 (drop first half)
+    for ((i=1; i<=40; i++)); do
+        rbd mirror snapshot schedule rm -p rbd2 --image "test$i"
+    done
+    get_mirror_snapshot_schedule_status 40 status_json
+    are_schedules_staggered "$status_json" "$interval_min"
+
+    # Re-add schedules for first half with explicit start time.
+    # These should all share the same next schedule_time.
+    for ((i=1; i<=40; i++)); do
+        rbd mirror snapshot schedule add -p rbd2 --image "test$i" "${interval_min}m" 2020-01-01
+    done
+    # Get updated status
+    get_mirror_snapshot_schedule_status 80 status_json
+
+    # Verify first half share the same next schedule_time
+    first_half_json=$(jq '
+        map(select(.image | test("^rbd2/test([1-9]|[1-3][0-9]|40)$")))
+    ' <<< "$status_json")
+    [ "$(jq 'length' <<< "$first_half_json")" -eq 40 ] || return 1
+    local anchored_times=()
+    mapfile -t anchored_times < <(
+        jq -r '.[].schedule_time' <<< "$first_half_json" | sort -u
+    )
+    (( ${#anchored_times[@]} == 1 )) || return 1
+
+    # Verify second half remains staggered
+    second_half_json=$(jq '
+        map(select(.image | test("^rbd2/test(4[1-9]|[5-7][0-9]|80)$")))
+    ' <<< "$status_json")
+    [ "$(jq 'length' <<< "$second_half_json")" -eq 40 ] || return 1
+    are_schedules_staggered "$second_half_json" "$interval_min"
+
+    # Cleanup: remove all schedules
+    for ((i=1; i<=80; i++)); do
+        rbd mirror snapshot schedule rm -p rbd2 --image "test$i"
+    done
+
+    # Wait until schedule status becomes empty
+    for ((j=0; j<12; j++)); do
+        status_json=$(rbd mirror snapshot schedule status -p rbd2 --format json)
+        [ "$(jq 'length' <<< "$status_json")" -eq 0 ] && break
+        sleep 10
+    done
+    [ "$(jq 'length' <<< "$status_json")" -eq 0 ] || {
+        echo "Error: mirror snapshot schedule status not empty after removals"
+        return 1
+    }
+
+    # Remove images
+    for ((i=1; i<=80; i++)); do
+        rbd rm "rbd2/test$i"
+    done
+
+    ceph osd pool rm rbd2 rbd2 --yes-i-really-really-mean-it
+}
+
  test_perf_image_iostat() {
      echo "testing perf image iostat..."
      remove_images
@@ -1780,8 +2096,10 @@ test_thick_provision
  test_namespace
  test_trash_purge_schedule
  test_trash_purge_schedule_recovery
+test_trash_purge_schedule_staggering
  test_mirror_snapshot_schedule
  test_mirror_snapshot_schedule_recovery
+test_mirror_snapshot_schedule_staggering
  test_perf_image_iostat
  test_perf_image_iostat_recovery
  test_mirror_pool_peer_bootstrap_create
diff --git a/src/pybind/mgr/rbd_support/mirror_snapshot_schedule.py b/src/pybind/mgr/rbd_support/mirror_snapshot_schedule.py

index 02e2b7882eb415b5575e2694d444dea03b1b61ba..d2133a4a1437a3401a2dce71ccf6ccd3804b2db4 100644 (file)
--- a/src/pybind/mgr/rbd_support/mirror_snapshot_schedule.py
+++ b/src/pybind/mgr/rbd_support/mirror_snapshot_schedule.py
@@ -1,6 +1,7 @@
  import errno
  import json
  import rados
+import random
  import rbd
  import traceback
  
@@ -519,7 +520,8 @@ class MirrorSnapshotScheduleHandler:
                      pool_id, namespace, image_id))
              return
  
-        schedule_time = schedule.next_run(now)
+        schedule_time = schedule.next_run(
+            now, "{}/{}/{}".format(pool_id, namespace, image_id))
          if schedule_time not in self.queue:
              self.queue[schedule_time] = []
          self.log.debug(
@@ -540,7 +542,8 @@ class MirrorSnapshotScheduleHandler:
              return None, (schedule_time - now).total_seconds()
  
          images = self.queue[schedule_time]
-        image = images.pop(0)
+        rng = random.Random(schedule_time.timestamp())
+        image = images.pop(rng.randrange(len(images)))
          if not images:
              del self.queue[schedule_time]
          return image, 0.0
diff --git a/src/pybind/mgr/rbd_support/schedule.py b/src/pybind/mgr/rbd_support/schedule.py

index 173ef7e6d5eb8ee0942a0eb59069ffbe285f6e64..fc4d40018f722b6d361abd1af6602e92d970d047 100644 (file)
--- a/src/pybind/mgr/rbd_support/schedule.py
+++ b/src/pybind/mgr/rbd_support/schedule.py
@@ -1,3 +1,4 @@
+import hashlib
  import json
  import rados
  import rbd
@@ -329,12 +330,27 @@ class Schedule:
                 start_time: Optional[StartTime] = None) -> None:
          self.items.discard((interval, start_time))
  
-    def next_run(self, now: datetime) -> datetime:
+    @staticmethod
+    def _compute_phase_offset_minutes(entity_id: str, period_minutes: int) -> int:
+        key = entity_id + "|" + str(period_minutes)
+        h = hashlib.md5(key.encode("utf-8")).hexdigest()
+        val = int(h, 16)
+        return (val % period_minutes)
+
+    def next_run(self, now: datetime, entity_id: str) -> datetime:
          schedule_time = None
  
          for interval, start_time in self.items:
              period = timedelta(minutes=interval.minutes)
-            anchor_time = start_time.dt if start_time else datetime(1970, 1, 1, tzinfo=timezone.utc)
+            if start_time:
+                anchor_time = start_time.dt
+            else:
+                phase_offset_minutes = self._compute_phase_offset_minutes(
+                    entity_id, interval.minutes)
+                anchor_time = (
+                    datetime(1970, 1, 1, tzinfo=timezone.utc)
+                    + timedelta(minutes=phase_offset_minutes)
+                )
  
              if anchor_time > now:
                  candidate_time = anchor_time
diff --git a/src/pybind/mgr/rbd_support/trash_purge_schedule.py b/src/pybind/mgr/rbd_support/trash_purge_schedule.py

index b9774d18e3d0ddd8c4a92c58acf0f8a84974decd..9284a41480e6fb619f42f35068ec4bf05c20a035 100644 (file)
--- a/src/pybind/mgr/rbd_support/trash_purge_schedule.py
+++ b/src/pybind/mgr/rbd_support/trash_purge_schedule.py
@@ -1,5 +1,6 @@
  import json
  import rados
+import random
  import rbd
  import traceback
  
@@ -177,7 +178,8 @@ class TrashPurgeScheduleHandler:
                      pool_id, namespace))
              return
  
-        schedule_time = schedule.next_run(now)
+        schedule_time = schedule.next_run(now,
+                                          "{}/{}".format(pool_id, namespace))
          if schedule_time not in self.queue:
              self.queue[schedule_time] = []
          self.log.debug(
@@ -198,7 +200,8 @@ class TrashPurgeScheduleHandler:
              return None, (schedule_time - now).total_seconds()
  
          namespaces = self.queue[schedule_time]
-        namespace = namespaces.pop(0)
+        rng = random.Random(schedule_time.timestamp())
+        namespace = namespaces.pop(rng.randrange(len(namespaces)))
          if not namespaces:
              del self.queue[schedule_time]
          return namespace, 0.0
author	Ramana Raja <rraja@redhat.com>
	Mon, 29 Dec 2025 22:17:28 +0000 (17:17 -0500)
committer	Ramana Raja <rraja@redhat.com>
	Tue, 3 Mar 2026 18:12:45 +0000 (13:12 -0500)
PendingReleaseNotes		patch \| blob \| history
doc/rbd/rbd-mirroring.rst		patch \| blob \| history
qa/workunits/rbd/cli_generic.sh		patch \| blob \| history
src/pybind/mgr/rbd_support/mirror_snapshot_schedule.py		patch \| blob \| history
src/pybind/mgr/rbd_support/schedule.py		patch \| blob \| history
src/pybind/mgr/rbd_support/trash_purge_schedule.py		patch \| blob \| history