]> git.apps.os.sepia.ceph.com Git - ceph.git/commitdiff
mgr/rbd_support: always rescan image mirror snapshots on refresh 46743/head
authorIlya Dryomov <idryomov@gmail.com>
Sun, 19 Jun 2022 10:12:01 +0000 (12:12 +0200)
committerIlya Dryomov <idryomov@gmail.com>
Tue, 21 Jun 2022 10:46:49 +0000 (12:46 +0200)
Establishing a watch on rbd_mirroring object and skipping rescanning
image mirror snapshots on periodic refresh unless rbd_mirroring object
gets notified in the interim is flawed.  rbd_mirroring object is
notified when mirroring is enabled or disabled on some image (including
when the image is removed), but it is not notified when images are
promoted or demoted.  However, load_pool_images() discards images that
are not primary at the time of the scan.  If the image is promoted
later, no snapshots are created even if the schedule is in place.  This
happens regardless of whether the schedule is added before or after the
promotion.

This effectively reverts commit 69259c8d3722 ("mgr/rbd_support: make
mirror_snapshot_schedule rescan only updated pools").  An alternative
fix could be to stop discarding non-primary images (i.e. drop

    if not info['primary']:
        continue

check added in commit d39eb283c5ce ("mgr/rbd_support: mirror snapshot
schedule should skip non-primary images")), but that would clutter the
queue and therefore "rbd mirror snapshot schedule status" output with
bogus entries.  Performing a rescan roughly every 60 seconds should be
manageable: currently it amounts to a single mirror_image_status_list
request, followed by mirror_image_get, get_snapcontext and snapshot_get
requests for each snapshot-based mirroring enabled image and concluded
by a single dir_list request.  Among these, per-image get_snapcontext
and snapshot_get requests are necessary for determining primaryness.

Fixes: https://tracker.ceph.com/issues/53914
Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
qa/workunits/rbd/cli_generic.sh
src/pybind/mgr/rbd_support/mirror_snapshot_schedule.py

index 6b403c09567dd4a662609439f49ae0ec25103b92..1a5b5d953d91ad9c7bdb15a51960a3809f8b550b 100755 (executable)
@@ -1310,6 +1310,20 @@ test_mirror_snapshot_schedule() {
     test "$(rbd mirror snapshot schedule status -p rbd2/ns1 --image test1 --format xml |
         $XMLSTARLET sel -t -v '//scheduled_images/image/image')" = 'rbd2/ns1/test1'
 
+    rbd mirror image demote rbd2/ns1/test1
+    for i in `seq 12`; do
+        rbd mirror snapshot schedule status | grep 'rbd2/ns1/test1' || break
+        sleep 10
+    done
+    rbd mirror snapshot schedule status | expect_fail grep 'rbd2/ns1/test1'
+
+    rbd mirror image promote rbd2/ns1/test1
+    for i in `seq 12`; do
+        rbd mirror snapshot schedule status | grep 'rbd2/ns1/test1' && break
+        sleep 10
+    done
+    rbd mirror snapshot schedule status | grep 'rbd2/ns1/test1'
+
     rbd mirror snapshot schedule add 1h 00:15
     test "$(rbd mirror snapshot schedule ls)" = 'every 1h starting at 00:15:00'
     rbd mirror snapshot schedule ls -R | grep 'every 1h starting at 00:15:00'
@@ -1331,11 +1345,11 @@ test_mirror_snapshot_schedule() {
     test "$(rbd mirror snapshot schedule ls -p rbd2/ns1 --image test1)" = 'every 1m'
 
     rbd rm rbd2/ns1/test1
-
     for i in `seq 12`; do
         rbd mirror snapshot schedule status | grep 'rbd2/ns1/test1' || break
         sleep 10
     done
+    rbd mirror snapshot schedule status | expect_fail grep 'rbd2/ns1/test1'
 
     rbd mirror snapshot schedule remove
     test "$(rbd mirror snapshot schedule ls -R --format json)" = "[]"
index be0bd8db2399f5bf7543e1879fcd9b095c6dccb8..c8cf4e4ee7ea303745bfcf9f3cd9f6f00b27e008 100644 (file)
@@ -12,8 +12,6 @@ from typing import Any, Dict, List, NamedTuple, Optional, Sequence, Set, Tuple,
 from .common import get_rbd_pools
 from .schedule import LevelSpec, Interval, StartTime, Schedule, Schedules
 
-MIRRORING_OID = "rbd_mirroring"
-
 def namespace_validator(ioctx: rados.Ioctx) -> None:
     mode = rbd.RBD().mirror_mode_get(ioctx)
     if mode != rbd.RBD_MIRROR_MODE_IMAGE:
@@ -25,125 +23,6 @@ def image_validator(image: rbd.Image) -> None:
     if mode != rbd.RBD_MIRROR_IMAGE_MODE_SNAPSHOT:
         raise rbd.InvalidArgument("Invalid mirror image mode")
 
-class Watchers:
-
-    lock = Lock()
-
-    def __init__(self, handler: Any) -> None:
-        self.rados = handler.module.rados
-        self.log = handler.log
-        self.watchers: Dict[Tuple[str, str], rados.Watch] = {}
-        self.updated: Dict[int, bool] = {}
-        self.error: Dict[int, str] = {}
-        self.epoch: Dict[int, int] = {}
-
-    def __del__(self) -> None:
-        self.unregister_all()
-
-    def _clean_watcher(self, pool_id: str, namespace: str, watch_id: int) -> None:
-        assert self.lock.locked()
-
-        del self.watchers[pool_id, namespace]
-        self.updated.pop(watch_id, None)
-        self.error.pop(watch_id, None)
-        self.epoch.pop(watch_id, None)
-
-    def check(self, pool_id: str, namespace: str, epoch: int) -> bool:
-        error = None
-        with self.lock:
-            watch = self.watchers.get((pool_id, namespace))
-            if watch is not None:
-                error = self.error.get(watch.get_id())
-                if not error:
-                    updated = self.updated[watch.get_id()]
-                    self.updated[watch.get_id()] = False
-                    self.epoch[watch.get_id()] = epoch
-                    return updated
-        if error:
-            self.unregister(pool_id, namespace)
-
-        if self.register(pool_id, namespace):
-            return self.check(pool_id, namespace, epoch)
-        else:
-            return True
-
-    def register(self, pool_id: str, namespace: str) -> bool:
-
-        def callback(notify_id: str, notifier_id: str, watch_id: int, data: str) -> None:
-            self.log.debug("watcher {}: got notify {} from {}".format(
-                watch_id, notify_id, notifier_id))
-
-            with self.lock:
-                self.updated[watch_id] = True
-
-        def error_callback(watch_id: int, error: str) -> None:
-            self.log.debug("watcher {}: got errror {}".format(
-                watch_id, error))
-
-            with self.lock:
-                self.error[watch_id] = error
-
-        try:
-            ioctx = self.rados.open_ioctx2(int(pool_id))
-            ioctx.set_namespace(namespace)
-            watch = ioctx.watch(MIRRORING_OID, callback, error_callback)
-        except rados.ObjectNotFound:
-            self.log.debug(
-                "{}/{}/{} watcher not registered: object not found".format(
-                    pool_id, namespace, MIRRORING_OID))
-            return False
-
-        self.log.debug("{}/{}/{} watcher {} registered".format(
-            pool_id, namespace, MIRRORING_OID, watch.get_id()))
-
-        with self.lock:
-            self.watchers[pool_id, namespace] = watch
-            self.updated[watch.get_id()] = True
-        return True
-
-    def unregister(self, pool_id: str, namespace: str) -> None:
-
-        with self.lock:
-            watch = self.watchers[pool_id, namespace]
-
-        watch_id = watch.get_id()
-
-        try:
-            watch.close()
-
-            self.log.debug("{}/{}/{} watcher {} unregistered".format(
-                pool_id, namespace, MIRRORING_OID, watch_id))
-
-        except rados.Error as e:
-            self.log.debug(
-                "exception when unregistering {}/{} watcher: {}".format(
-                    pool_id, namespace, e))
-
-        with self.lock:
-            self._clean_watcher(pool_id, namespace, watch_id)
-
-    def unregister_all(self) -> None:
-        with self.lock:
-            watchers = list(self.watchers)
-
-        for pool_id, namespace in watchers:
-            self.unregister(pool_id, namespace)
-
-    def unregister_stale(self, current_epoch: int) -> None:
-        with self.lock:
-            watchers = list(self.watchers)
-
-        for pool_id, namespace in watchers:
-            with self.lock:
-                watch = self.watchers[pool_id, namespace]
-                if self.epoch.get(watch.get_id()) == current_epoch:
-                    continue
-
-            self.log.debug("{}/{}/{} watcher {} stale".format(
-                pool_id, namespace, MIRRORING_OID, watch.get_id()))
-
-            self.unregister(pool_id, namespace)
-
 
 class ImageSpec(NamedTuple):
     pool_id: str
@@ -464,7 +343,6 @@ class MirrorSnapshotScheduleHandler:
         self.thread.start()
 
     def _cleanup(self) -> None:
-        self.watchers.unregister_all()
         self.create_snapshot_requests.wait_for_pending()
 
     def run(self) -> None:
@@ -491,7 +369,6 @@ class MirrorSnapshotScheduleHandler:
         self.queue: Dict[str, List[ImageSpec]] = {}
         # pool_id => {namespace => image_id}
         self.images: Dict[str, Dict[str, Dict[str, str]]] = {}
-        self.watchers = Watchers(self)
         self.refresh_images()
         self.log.debug("MirrorSnapshotScheduleHandler: queue is initialized")
 
@@ -513,13 +390,11 @@ class MirrorSnapshotScheduleHandler:
             self.load_schedules()
             if not self.schedules:
                 self.log.debug("MirrorSnapshotScheduleHandler: no schedules")
-                self.watchers.unregister_all()
                 self.images = {}
                 self.queue = {}
                 self.last_refresh_images = datetime.now()
                 return self.REFRESH_DELAY_SECONDS
 
-        epoch = int(datetime.now().strftime('%s'))
         images: Dict[str, Dict[str, Dict[str, str]]] = {}
 
         for pool_id, pool_name in get_rbd_pools(self.module).items():
@@ -527,19 +402,17 @@ class MirrorSnapshotScheduleHandler:
                     LevelSpec.from_pool_spec(pool_id, pool_name)):
                 continue
             with self.module.rados.open_ioctx2(int(pool_id)) as ioctx:
-                self.load_pool_images(ioctx, epoch, images)
+                self.load_pool_images(ioctx, images)
 
         with self.lock:
             self.refresh_queue(images)
             self.images = images
 
-        self.watchers.unregister_stale(epoch)
         self.last_refresh_images = datetime.now()
         return self.REFRESH_DELAY_SECONDS
 
     def load_pool_images(self,
                          ioctx: rados.Ioctx,
-                         epoch: int,
                          images: Dict[str, Dict[str, Dict[str, str]]]) -> None:
         pool_id = str(ioctx.get_pool_id())
         pool_name = ioctx.get_pool_name()
@@ -557,14 +430,6 @@ class MirrorSnapshotScheduleHandler:
                     pool_name, namespace))
                 images[pool_id][namespace] = {}
                 ioctx.set_namespace(namespace)
-                updated = self.watchers.check(pool_id, namespace, epoch)
-                if not updated:
-                    self.log.debug("load_pool_images: {}/{} not updated".format(
-                        pool_name, namespace))
-                    with self.lock:
-                        images[pool_id][namespace] = \
-                            self.images[pool_id][namespace]
-                    continue
                 mirror_images = dict(rbd.RBD().mirror_image_info_list(
                     ioctx, rbd.RBD_MIRROR_IMAGE_MODE_SNAPSHOT))
                 if not mirror_images: