]> git.apps.os.sepia.ceph.com Git - ceph.git/commitdiff
mgr/rbd_support: always rescan image mirror snapshots on refresh 46777/head
authorIlya Dryomov <idryomov@gmail.com>
Sun, 19 Jun 2022 10:12:01 +0000 (12:12 +0200)
committerIlya Dryomov <idryomov@gmail.com>
Tue, 21 Jun 2022 15:30:18 +0000 (17:30 +0200)
Establishing a watch on rbd_mirroring object and skipping rescanning
image mirror snapshots on periodic refresh unless rbd_mirroring object
gets notified in the interim is flawed.  rbd_mirroring object is
notified when mirroring is enabled or disabled on some image (including
when the image is removed), but it is not notified when images are
promoted or demoted.  However, load_pool_images() discards images that
are not primary at the time of the scan.  If the image is promoted
later, no snapshots are created even if the schedule is in place.  This
happens regardless of whether the schedule is added before or after the
promotion.

This effectively reverts commit 69259c8d3722 ("mgr/rbd_support: make
mirror_snapshot_schedule rescan only updated pools").  An alternative
fix could be to stop discarding non-primary images (i.e. drop

    if not info['primary']:
        continue

check added in commit d39eb283c5ce ("mgr/rbd_support: mirror snapshot
schedule should skip non-primary images")), but that would clutter the
queue and therefore "rbd mirror snapshot schedule status" output with
bogus entries.  Performing a rescan roughly every 60 seconds should be
manageable: currently it amounts to a single mirror_image_status_list
request, followed by mirror_image_get, get_snapcontext and snapshot_get
requests for each snapshot-based mirroring enabled image and concluded
by a single dir_list request.  Among these, per-image get_snapcontext
and snapshot_get requests are necessary for determining primaryness.

Fixes: https://tracker.ceph.com/issues/53914
Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
(cherry picked from commit 7fb4fdbed0b908a2105ac44deca48f2170e46fe5)

Conflicts:
src/pybind/mgr/rbd_support/mirror_snapshot_schedule.py [ commit
  e4a16e261370 ("mgr/rbd_support: add type annotation") not in
  octopus ]

qa/workunits/rbd/cli_generic.sh
src/pybind/mgr/rbd_support/mirror_snapshot_schedule.py

index 32c45790fa603fde13166233b0f1cffce2f35870..a1723509586e2d627fa0e4bcd5ca655172e4d6cd 100755 (executable)
@@ -1304,6 +1304,20 @@ test_mirror_snapshot_schedule() {
     test "$(rbd mirror snapshot schedule status -p rbd2/ns1 --image test1 --format xml |
         $XMLSTARLET sel -t -v '//scheduled_images/image/image')" = 'rbd2/ns1/test1'
 
+    rbd mirror image demote rbd2/ns1/test1
+    for i in `seq 12`; do
+        rbd mirror snapshot schedule status | grep 'rbd2/ns1/test1' || break
+        sleep 10
+    done
+    rbd mirror snapshot schedule status | expect_fail grep 'rbd2/ns1/test1'
+
+    rbd mirror image promote rbd2/ns1/test1
+    for i in `seq 12`; do
+        rbd mirror snapshot schedule status | grep 'rbd2/ns1/test1' && break
+        sleep 10
+    done
+    rbd mirror snapshot schedule status | grep 'rbd2/ns1/test1'
+
     rbd mirror snapshot schedule add 1h 00:15
     test "$(rbd mirror snapshot schedule ls)" = 'every 1h starting at 00:15:00'
     rbd mirror snapshot schedule ls -R | grep 'every 1h starting at 00:15:00'
@@ -1325,11 +1339,11 @@ test_mirror_snapshot_schedule() {
     test "$(rbd mirror snapshot schedule ls -p rbd2/ns1 --image test1)" = 'every 1m'
 
     rbd rm rbd2/ns1/test1
-
     for i in `seq 12`; do
         rbd mirror snapshot schedule status | grep 'rbd2/ns1/test1' || break
         sleep 10
     done
+    rbd mirror snapshot schedule status | expect_fail grep 'rbd2/ns1/test1'
 
     rbd mirror snapshot schedule remove
     test "$(rbd mirror snapshot schedule ls -R --format json)" = "[]"
index eb10d0710b0f3f02963672c091cd3254e9bfae7f..536ee3d16c9997fdfd4fa2cbb5cb7977c99cd557 100644 (file)
@@ -11,8 +11,6 @@ from threading import Condition, Lock, Thread
 from .common import get_rbd_pools
 from .schedule import LevelSpec, Interval, StartTime, Schedule, Schedules
 
-MIRRORING_OID = "rbd_mirroring"
-
 def namespace_validator(ioctx):
     mode = rbd.RBD().mirror_mode_get(ioctx)
     if mode != rbd.RBD_MIRROR_MODE_IMAGE:
@@ -24,125 +22,6 @@ def image_validator(image):
     if mode != rbd.RBD_MIRROR_IMAGE_MODE_SNAPSHOT:
         raise rbd.InvalidArgument("Invalid mirror image mode")
 
-class Watchers:
-
-    lock = Lock()
-
-    def __init__(self, handler):
-        self.rados = handler.module.rados
-        self.log = handler.log
-        self.watchers = {}
-        self.updated = {}
-        self.error = {}
-        self.epoch = {}
-
-    def __del__(self):
-        self.unregister_all()
-
-    def _clean_watcher(self, pool_id, namespace, watch_id):
-        assert self.lock.locked()
-
-        del self.watchers[pool_id, namespace]
-        self.updated.pop(watch_id, None)
-        self.error.pop(watch_id, None)
-        self.epoch.pop(watch_id, None)
-
-    def check(self, pool_id, namespace, epoch):
-        error = None
-        with self.lock:
-            watch = self.watchers.get((pool_id, namespace))
-            if watch is not None:
-                error = self.error.get(watch.get_id())
-                if not error:
-                    updated = self.updated[watch.get_id()]
-                    self.updated[watch.get_id()] = False
-                    self.epoch[watch.get_id()] = epoch
-                    return updated
-        if error:
-            self.unregister(pool_id, namespace)
-
-        if self.register(pool_id, namespace):
-            return self.check(pool_id, namespace, epoch)
-        else:
-            return True
-
-    def register(self, pool_id, namespace):
-
-        def callback(notify_id, notifier_id, watch_id, data):
-            self.log.debug("watcher {}: got notify {} from {}".format(
-                watch_id, notify_id, notifier_id))
-
-            with self.lock:
-                self.updated[watch_id] = True
-
-        def error_callback(watch_id, error):
-            self.log.debug("watcher {}: got errror {}".format(
-                watch_id, error))
-
-            with self.lock:
-                self.error[watch_id] = error
-
-        try:
-            ioctx = self.rados.open_ioctx2(int(pool_id))
-            ioctx.set_namespace(namespace)
-            watch = ioctx.watch(MIRRORING_OID, callback, error_callback)
-        except rados.ObjectNotFound:
-            self.log.debug(
-                "{}/{}/{} watcher not registered: object not found".format(
-                    pool_id, namespace, MIRRORING_OID))
-            return False
-
-        self.log.debug("{}/{}/{} watcher {} registered".format(
-            pool_id, namespace, MIRRORING_OID, watch.get_id()))
-
-        with self.lock:
-            self.watchers[pool_id, namespace] = watch
-            self.updated[watch.get_id()] = True
-        return True
-
-    def unregister(self, pool_id, namespace):
-
-        with self.lock:
-            watch = self.watchers[pool_id, namespace]
-
-        watch_id = watch.get_id()
-
-        try:
-            watch.close()
-
-            self.log.debug("{}/{}/{} watcher {} unregistered".format(
-                pool_id, namespace, MIRRORING_OID, watch_id))
-
-        except rados.Error as e:
-            self.log.debug(
-                "exception when unregistering {}/{} watcher: {}".format(
-                    pool_id, namespace, e))
-
-        with self.lock:
-            self._clean_watcher(pool_id, namespace, watch_id)
-
-    def unregister_all(self):
-        with self.lock:
-            watchers = list(self.watchers)
-
-        for pool_id, namespace in watchers:
-            self.unregister(pool_id, namespace)
-
-    def unregister_stale(self, current_epoch):
-        with self.lock:
-            watchers = list(self.watchers)
-
-        for pool_id, namespace in watchers:
-            with self.lock:
-                watch = self.watchers[pool_id, namespace]
-                if self.epoch.get(watch.get_id()) == current_epoch:
-                    continue
-
-            self.log.debug("{}/{}/{} watcher {} stale".format(
-                pool_id, namespace, MIRRORING_OID, watch_id))
-
-            self.unregister(pool_id, namespace)
-
 
 class CreateSnapshotRequests:
 
@@ -439,7 +318,6 @@ class MirrorSnapshotScheduleHandler:
         self.thread.start()
 
     def _cleanup(self):
-        self.watchers.unregister_all()
         self.create_snapshot_requests.wait_for_pending()
 
     def run(self):
@@ -464,7 +342,6 @@ class MirrorSnapshotScheduleHandler:
     def init_schedule_queue(self):
         self.queue = {}
         self.images = {}
-        self.watchers = Watchers(self)
         self.refresh_images()
         self.log.debug("MirrorSnapshotScheduleHandler: queue is initialized")
 
@@ -486,13 +363,11 @@ class MirrorSnapshotScheduleHandler:
             self.load_schedules()
             if not self.schedules:
                 self.log.debug("MirrorSnapshotScheduleHandler: no schedules")
-                self.watchers.unregister_all()
                 self.images = {}
                 self.queue = {}
                 self.last_refresh_images = datetime.now()
                 return self.REFRESH_DELAY_SECONDS
 
-        epoch = int(datetime.now().strftime('%s'))
         images = {}
 
         for pool_id, pool_name in get_rbd_pools(self.module).items():
@@ -500,17 +375,16 @@ class MirrorSnapshotScheduleHandler:
                     LevelSpec.from_pool_spec(pool_id, pool_name)):
                 continue
             with self.module.rados.open_ioctx2(int(pool_id)) as ioctx:
-                self.load_pool_images(ioctx, epoch, images)
+                self.load_pool_images(ioctx, images)
 
         with self.lock:
             self.refresh_queue(images)
             self.images = images
 
-        self.watchers.unregister_stale(epoch)
         self.last_refresh_images = datetime.now()
         return self.REFRESH_DELAY_SECONDS
 
-    def load_pool_images(self, ioctx, epoch, images):
+    def load_pool_images(self, ioctx, images):
         pool_id = str(ioctx.get_pool_id())
         pool_name = ioctx.get_pool_name()
         images[pool_id] = {}
@@ -527,14 +401,6 @@ class MirrorSnapshotScheduleHandler:
                     pool_name, namespace))
                 images[pool_id][namespace] = {}
                 ioctx.set_namespace(namespace)
-                updated = self.watchers.check(pool_id, namespace, epoch)
-                if not updated:
-                    self.log.debug("load_pool_images: {}/{} not updated".format(
-                        pool_name, namespace))
-                    with self.lock:
-                        images[pool_id][namespace] = \
-                            self.images[pool_id][namespace]
-                    continue
                 mirror_images = dict(rbd.RBD().mirror_image_info_list(
                     ioctx, rbd.RBD_MIRROR_IMAGE_MODE_SNAPSHOT))
                 if not mirror_images: