]> git.apps.os.sepia.ceph.com Git - ceph.git/commitdiff
mgr/rbd_support: avoid losing a schedule on load vs add race 46734/head
authorIlya Dryomov <idryomov@gmail.com>
Fri, 17 Jun 2022 12:03:20 +0000 (14:03 +0200)
committerIlya Dryomov <idryomov@gmail.com>
Fri, 17 Jun 2022 14:20:26 +0000 (16:20 +0200)
If load_schedules() (i.e. periodic refresh) races with add_schedule()
invoked by the user for a fresh image, that image's schedule may get
lost until the next rebuild (not refresh!) of the queue:

1. periodic refresh invokes load_schedules()
2. load_schedules() creates a new Schedules instance and loads
   schedules from rbd_mirror_snapshot_schedule object
3. add_schedule() is invoked for a new image (an image that isn't
   present in self.images) by the user
4. before load_schedules() can grab self.lock, add_schedule() commits
   the new schedule to rbd_mirror_snapshot_schedule object and adds it
   to self.schedules
5. load_schedules() grabs self.lock and reassigns self.schedules with
   Schedules instance that is now stale
6. periodic refresh invokes load_pool_images() which discovers the new
   image; eventually it is added to self.images
7. periodic refresh invokes refresh_queue() which attempts to enqueue()
   the new image; this fails because a matching schedule isn't present

The next periodic refresh recovers the discarded schedule from
rbd_mirror_snapshot_schedule object but no attempt to enqueue() that
image is made since it is already "known" at that point.  Despite the
schedule being in place, no snapshots are created until the queue is
rebuilt from scratch or rbd_support module is reloaded.

To fix that, extend self.lock critical sections so that add_schedule()
and remove_schedule() can't get stepped on by load_schedules().

Fixes: https://tracker.ceph.com/issues/56090
Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
src/pybind/mgr/rbd_support/mirror_snapshot_schedule.py
src/pybind/mgr/rbd_support/trash_purge_schedule.py

index ffcd357d2ed17d3a429c924eb0a5666a92105f49..be0bd8db2399f5bf7543e1879fcd9b095c6dccb8 100644 (file)
@@ -500,8 +500,7 @@ class MirrorSnapshotScheduleHandler:
 
         schedules = Schedules(self)
         schedules.load(namespace_validator, image_validator)
-        with self.lock:
-            self.schedules = schedules
+        self.schedules = schedules
 
     def refresh_images(self) -> float:
         elapsed = (datetime.now() - self.last_refresh_images).total_seconds()
@@ -510,9 +509,8 @@ class MirrorSnapshotScheduleHandler:
 
         self.log.debug("MirrorSnapshotScheduleHandler: refresh_images")
 
-        self.load_schedules()
-
         with self.lock:
+            self.load_schedules()
             if not self.schedules:
                 self.log.debug("MirrorSnapshotScheduleHandler: no schedules")
                 self.watchers.unregister_all()
@@ -595,25 +593,24 @@ class MirrorSnapshotScheduleHandler:
                     pool_name, e))
 
     def rebuild_queue(self) -> None:
-        with self.lock:
-            now = datetime.now()
+        now = datetime.now()
 
-            # don't remove from queue "due" images
-            now_string = datetime.strftime(now, "%Y-%m-%d %H:%M:00")
+        # don't remove from queue "due" images
+        now_string = datetime.strftime(now, "%Y-%m-%d %H:%M:00")
 
-            for schedule_time in list(self.queue):
-                if schedule_time > now_string:
-                    del self.queue[schedule_time]
+        for schedule_time in list(self.queue):
+            if schedule_time > now_string:
+                del self.queue[schedule_time]
 
-            if not self.schedules:
-                return
+        if not self.schedules:
+            return
 
-            for pool_id in self.images:
-                for namespace in self.images[pool_id]:
-                    for image_id in self.images[pool_id][namespace]:
-                        self.enqueue(now, pool_id, namespace, image_id)
+        for pool_id in self.images:
+            for namespace in self.images[pool_id]:
+                for image_id in self.images[pool_id][namespace]:
+                    self.enqueue(now, pool_id, namespace, image_id)
 
-            self.condition.notify()
+        self.condition.notify()
 
     def refresh_queue(self,
                       current_images: Dict[str, Dict[str, Dict[str, str]]]) -> None:
@@ -696,11 +693,10 @@ class MirrorSnapshotScheduleHandler:
             "MirrorSnapshotScheduleHandler: add_schedule: level_spec={}, interval={}, start_time={}".format(
                 level_spec.name, interval, start_time))
 
+        # TODO: optimize to rebuild only affected part of the queue
         with self.lock:
             self.schedules.add(level_spec, interval, start_time)
-
-        # TODO: optimize to rebuild only affected part of the queue
-        self.rebuild_queue()
+            self.rebuild_queue()
         return 0, "", ""
 
     def remove_schedule(self,
@@ -711,11 +707,10 @@ class MirrorSnapshotScheduleHandler:
             "MirrorSnapshotScheduleHandler: remove_schedule: level_spec={}, interval={}, start_time={}".format(
                 level_spec.name, interval, start_time))
 
+        # TODO: optimize to rebuild only affected part of the queue
         with self.lock:
             self.schedules.remove(level_spec, interval, start_time)
-
-        # TODO: optimize to rebuild only affected part of the queue
-        self.rebuild_queue()
+            self.rebuild_queue()
         return 0, "", ""
 
     def list(self, level_spec: LevelSpec) -> Tuple[int, str, str]:
index 1cb87d8a82cf0a256a05ca5ae0ed3cd44efa45d2..5f817bd8038870227b2b75372e8cdde13c6f2767 100644 (file)
@@ -72,8 +72,7 @@ class TrashPurgeScheduleHandler:
 
         schedules = Schedules(self)
         schedules.load()
-        with self.lock:
-            self.schedules = schedules
+        self.schedules = schedules
 
     def refresh_pools(self) -> float:
         elapsed = (datetime.now() - self.last_refresh_pools).total_seconds()
@@ -82,9 +81,8 @@ class TrashPurgeScheduleHandler:
 
         self.log.debug("TrashPurgeScheduleHandler: refresh_pools")
 
-        self.load_schedules()
-
         with self.lock:
+            self.load_schedules()
             if not self.schedules:
                 self.log.debug("TrashPurgeScheduleHandler: no schedules")
                 self.pools = {}
@@ -128,24 +126,23 @@ class TrashPurgeScheduleHandler:
             pools[pool_id][namespace] = pool_name
 
     def rebuild_queue(self) -> None:
-        with self.lock:
-            now = datetime.now()
+        now = datetime.now()
 
-            # don't remove from queue "due" images
-            now_string = datetime.strftime(now, "%Y-%m-%d %H:%M:00")
+        # don't remove from queue "due" images
+        now_string = datetime.strftime(now, "%Y-%m-%d %H:%M:00")
 
-            for schedule_time in list(self.queue):
-                if schedule_time > now_string:
-                    del self.queue[schedule_time]
+        for schedule_time in list(self.queue):
+            if schedule_time > now_string:
+                del self.queue[schedule_time]
 
-            if not self.schedules:
-                return
+        if not self.schedules:
+            return
 
-            for pool_id, namespaces in self.pools.items():
-                for namespace in namespaces:
-                    self.enqueue(now, pool_id, namespace)
+        for pool_id, namespaces in self.pools.items():
+            for namespace in namespaces:
+                self.enqueue(now, pool_id, namespace)
 
-            self.condition.notify()
+        self.condition.notify()
 
     def refresh_queue(self, current_pools: Dict[str, Dict[str, str]]) -> None:
         now = datetime.now()
@@ -222,11 +219,10 @@ class TrashPurgeScheduleHandler:
             "TrashPurgeScheduleHandler: add_schedule: level_spec={}, interval={}, start_time={}".format(
                 level_spec.name, interval, start_time))
 
+        # TODO: optimize to rebuild only affected part of the queue
         with self.lock:
             self.schedules.add(level_spec, interval, start_time)
-
-        # TODO: optimize to rebuild only affected part of the queue
-        self.rebuild_queue()
+            self.rebuild_queue()
         return 0, "", ""
 
     def remove_schedule(self,
@@ -237,11 +233,10 @@ class TrashPurgeScheduleHandler:
             "TrashPurgeScheduleHandler: remove_schedule: level_spec={}, interval={}, start_time={}".format(
                 level_spec.name, interval, start_time))
 
+        # TODO: optimize to rebuild only affected part of the queue
         with self.lock:
             self.schedules.remove(level_spec, interval, start_time)
-
-        # TODO: optimize to rebuild only affected part of the queue
-        self.rebuild_queue()
+            self.rebuild_queue()
         return 0, "", ""
 
     def list(self, level_spec: LevelSpec) -> Tuple[int, str, str]: