git.apps.os.sepia.ceph.com Git - ceph.git/commitdiff
rbd_support: recover from "double blocklisting"
author Ramana Raja <rraja@redhat.com>
Wed, 10 May 2023 18:37:44 +0000 (14:37 -0400)
committer Ilya Dryomov <idryomov@gmail.com>
Tue, 16 May 2023 20:18:18 +0000 (22:18 +0200)
Recover from being blocklisted while recovering from blocklisting.
When the rbd_support module is being set up to recover from client
blocklisting, the module's new rados client connection can also get
blocklisted. Currently, this will cause the recovery to fail and
the module will remain inoperable. Instead, retry module recovery
when the new client gets blocklisted during the module setup in the
recovery thread.

Fixes: https://tracker.ceph.com/issues/59713
Signed-off-by: Ramana Raja <rraja@redhat.com>
(cherry picked from commit 4523d9b68ee84f69e8665a728d4037b53cdf3d6f)

src/pybind/mgr/rbd_support/mirror_snapshot_schedule.py
src/pybind/mgr/rbd_support/module.py
src/pybind/mgr/rbd_support/perf.py
src/pybind/mgr/rbd_support/task.py
src/pybind/mgr/rbd_support/trash_purge_schedule.py

index 921a65c094536ebe7da3c582196056173dd88d5e..0bae891c67aee6db6cb689475540e7fda53b043e 100644 (file)
@@ -338,10 +338,11 @@ class MirrorSnapshotScheduleHandler:
         self.last_refresh_images = datetime(1970, 1, 1)
         self.create_snapshot_requests = CreateSnapshotRequests(self)
 
-        self.init_schedule_queue()
-
         self.stop_thread = False
         self.thread = Thread(target=self.run)
+
+    def setup(self) -> None:
+        self.init_schedule_queue()
         self.thread.start()
 
     def shutdown(self) -> None:
index a190930ecc604b22e81500af95eefdcfc6f8d768..c856418f99246be20e64dcdcd73443801e8b942a 100644 (file)
@@ -83,19 +83,26 @@ class Module(MgrModule):
     def __init__(self, *args: Any, **kwargs: Any) -> None:
         super(Module, self).__init__(*args, **kwargs)
         self.client_blocklisted = Event()
+        self.module_ready = False
+        self.init_handlers()
         self.recovery_thread = Thread(target=self.run)
         self.recovery_thread.start()
-        self.setup()
 
-    def setup(self) -> None:
-        self.log.info("starting setup")
-        # new client is created and registed in the MgrMap implicitly
-        # as 'rados' is a property attribute.
-        self.rados.wait_for_latest_osdmap()
+    def init_handlers(self) -> None:
         self.mirror_snapshot_schedule = MirrorSnapshotScheduleHandler(self)
         self.perf = PerfHandler(self)
         self.task = TaskHandler(self)
         self.trash_purge_schedule = TrashPurgeScheduleHandler(self)
+
+    def setup_handlers(self) -> None:
+        self.log.info("starting setup")
+        # new RADOS client is created and registered in the MgrMap
+        # implicitly here as 'rados' is a property attribute.
+        self.rados.wait_for_latest_osdmap()
+        self.mirror_snapshot_schedule.setup()
+        self.perf.setup()
+        self.task.setup()
+        self.trash_purge_schedule.setup()
         self.log.info("setup complete")
         self.module_ready = True
 
@@ -103,13 +110,18 @@ class Module(MgrModule):
         self.log.info("recovery thread starting")
         try:
             while True:
-                # block until rados client is blocklisted
-                self.client_blocklisted.wait()
-                self.log.info("restarting")
+                try:
+                    self.setup_handlers()
+                except (rados.ConnectionShutdown, rbd.ConnectionShutdown):
+                    self.log.exception("setup_handlers: client blocklisted")
+                    self.log.info("recovering from double blocklisting")
+                else:
+                    # block until RADOS client is blocklisted
+                    self.client_blocklisted.wait()
+                    self.log.info("recovering from blocklisting")
                 self.shutdown()
                 self.client_blocklisted.clear()
-                self.setup()
-                self.log.info("restarted")
+                self.init_handlers()
         except Exception as ex:
             self.log.fatal("Fatal runtime error: {}\n{}".format(
                 ex, traceback.format_exc()))
index 4bcf0a18c56b71c043d78d9d451c0dc44518c680..68cbbd3b5f48b42fa0fd61f4f05ae327965744df 100644 (file)
@@ -119,6 +119,8 @@ class PerfHandler:
 
         self.stop_thread = False
         self.thread = Thread(target=self.run)
+
+    def setup(self) -> None:
         self.thread.start()
 
     def shutdown(self) -> None:
index 7dba510baa781d415c996d021ae2770e279ee92e..101d480dc1d7aafc4fd0ff70276b8020f535a2eb 100644 (file)
@@ -166,11 +166,12 @@ class TaskHandler:
         self.module = module
         self.log = module.log
 
-        with self.lock:
-            self.init_task_queue()
-
         self.stop_thread = False
         self.thread = Thread(target=self.run)
+
+    def setup(self) -> None:
+        with self.lock:
+            self.init_task_queue()
         self.thread.start()
 
     @property
index 9cb349fcac0e7f729f43db07cf0862dc62a73766..b2f7b1614f1325e3964d206ae4d84d0428be60ef 100644 (file)
@@ -24,10 +24,11 @@ class TrashPurgeScheduleHandler:
         self.log = module.log
         self.last_refresh_pools = datetime(1970, 1, 1)
 
-        self.init_schedule_queue()
-
         self.stop_thread = False
         self.thread = Thread(target=self.run)
+
+    def setup(self) -> None:
+        self.init_schedule_queue()
         self.thread.start()
 
     def shutdown(self) -> None: