git-server-git.apps.pok.os.sepia.ceph.com Git - ceph.git/commitdiff
rbd_support: recover from "double blocklisting" 51758/head
author Ramana Raja <rraja@redhat.com>
Wed, 10 May 2023 18:37:44 +0000 (14:37 -0400)
committer Ramana Raja <rraja@redhat.com>
Thu, 25 May 2023 12:14:41 +0000 (08:14 -0400)
Recover from being blocklisted while recovering from blocklisting.
When the rbd_support module is being set up to recover from client
blocklisting, the module's new RADOS client connection can also get
blocklisted. Currently, this causes the recovery to fail and leaves
the module inoperable. Instead, retry module recovery when the new
client gets blocklisted during the module setup in the recovery
thread.

Fixes: https://tracker.ceph.com/issues/59713
Signed-off-by: Ramana Raja <rraja@redhat.com>
(cherry picked from commit 4523d9b68ee84f69e8665a728d4037b53cdf3d6f)

src/pybind/mgr/rbd_support/mirror_snapshot_schedule.py
src/pybind/mgr/rbd_support/module.py
src/pybind/mgr/rbd_support/perf.py
src/pybind/mgr/rbd_support/task.py
src/pybind/mgr/rbd_support/trash_purge_schedule.py

index 187d6e53971823097d694bd466512bc473647b6c..2e53fc1b28c5f3ef319ace06e75cf2510af0e4f3 100644 (file)
@@ -338,10 +338,11 @@ class MirrorSnapshotScheduleHandler:
         self.last_refresh_images = datetime(1970, 1, 1)
         self.create_snapshot_requests = CreateSnapshotRequests(self)
 
-        self.init_schedule_queue()
-
         self.stop_thread = False
         self.thread = Thread(target=self.run)
+
+    def setup(self) -> None:
+        self.init_schedule_queue()
         self.thread.start()
 
     def shutdown(self) -> None:
index a190930ecc604b22e81500af95eefdcfc6f8d768..c856418f99246be20e64dcdcd73443801e8b942a 100644 (file)
@@ -83,19 +83,26 @@ class Module(MgrModule):
     def __init__(self, *args: Any, **kwargs: Any) -> None:
         super(Module, self).__init__(*args, **kwargs)
         self.client_blocklisted = Event()
+        self.module_ready = False
+        self.init_handlers()
         self.recovery_thread = Thread(target=self.run)
         self.recovery_thread.start()
-        self.setup()
 
-    def setup(self) -> None:
-        self.log.info("starting setup")
-        # new client is created and registed in the MgrMap implicitly
-        # as 'rados' is a property attribute.
-        self.rados.wait_for_latest_osdmap()
+    def init_handlers(self) -> None:
         self.mirror_snapshot_schedule = MirrorSnapshotScheduleHandler(self)
         self.perf = PerfHandler(self)
         self.task = TaskHandler(self)
         self.trash_purge_schedule = TrashPurgeScheduleHandler(self)
+
+    def setup_handlers(self) -> None:
+        self.log.info("starting setup")
+        # new RADOS client is created and registered in the MgrMap
+        # implicitly here as 'rados' is a property attribute.
+        self.rados.wait_for_latest_osdmap()
+        self.mirror_snapshot_schedule.setup()
+        self.perf.setup()
+        self.task.setup()
+        self.trash_purge_schedule.setup()
         self.log.info("setup complete")
         self.module_ready = True
 
@@ -103,13 +110,18 @@ class Module(MgrModule):
         self.log.info("recovery thread starting")
         try:
             while True:
-                # block until rados client is blocklisted
-                self.client_blocklisted.wait()
-                self.log.info("restarting")
+                try:
+                    self.setup_handlers()
+                except (rados.ConnectionShutdown, rbd.ConnectionShutdown):
+                    self.log.exception("setup_handlers: client blocklisted")
+                    self.log.info("recovering from double blocklisting")
+                else:
+                    # block until RADOS client is blocklisted
+                    self.client_blocklisted.wait()
+                    self.log.info("recovering from blocklisting")
                 self.shutdown()
                 self.client_blocklisted.clear()
-                self.setup()
-                self.log.info("restarted")
+                self.init_handlers()
         except Exception as ex:
             self.log.fatal("Fatal runtime error: {}\n{}".format(
                 ex, traceback.format_exc()))
index 4bcf0a18c56b71c043d78d9d451c0dc44518c680..68cbbd3b5f48b42fa0fd61f4f05ae327965744df 100644 (file)
@@ -119,6 +119,8 @@ class PerfHandler:
 
         self.stop_thread = False
         self.thread = Thread(target=self.run)
+
+    def setup(self) -> None:
         self.thread.start()
 
     def shutdown(self) -> None:
index f572cd5762f4c3a257054250ed5ba120d8406ae7..7d6b2cc1f3a2f8a77eca11ace4c880f9404ef3d5 100644 (file)
@@ -165,11 +165,12 @@ class TaskHandler:
         self.module = module
         self.log = module.log
 
-        with self.lock:
-            self.init_task_queue()
-
         self.stop_thread = False
         self.thread = Thread(target=self.run)
+
+    def setup(self) -> None:
+        with self.lock:
+            self.init_task_queue()
         self.thread.start()
 
     @property
index 4b715ec97673d9f3eeda6c3c32c0fced97eecf26..b5ff9f125ab0e054b051be6f37bbafa028fecae2 100644 (file)
@@ -26,10 +26,11 @@ class TrashPurgeScheduleHandler:
         self.log = module.log
         self.last_refresh_pools = datetime(1970, 1, 1)
 
-        self.init_schedule_queue()
-
         self.stop_thread = False
         self.thread = Thread(target=self.run)
+
+    def setup(self) -> None:
+        self.init_schedule_queue()
         self.thread.start()
 
     def shutdown(self) -> None: