git.apps.os.sepia.ceph.com Git - ceph.git/commitdiff
rbd_support: recover from "double blocklisting"
author Ramana Raja <rraja@redhat.com>
Wed, 10 May 2023 18:37:44 +0000 (14:37 -0400)
committer Ramana Raja <rraja@redhat.com>
Tue, 8 Aug 2023 20:26:19 +0000 (16:26 -0400)
Recover from being blocklisted while recovering from blocklisting.
When the rbd_support module is being set up to recover from client
blocklisting, the module's new rados client connection can also get
blocklisted. Currently, this will cause the recovery to fail and
the module will remain inoperable. Instead, retry module recovery
when the new client gets blocklisted during the module setup in the
recovery thread.

Fixes: https://tracker.ceph.com/issues/59713
Signed-off-by: Ramana Raja <rraja@redhat.com>
(cherry picked from commit 4523d9b68ee84f69e8665a728d4037b53cdf3d6f)

Conflicts:
src/pybind/mgr/rbd_support/mirror_snapshot_schedule.py
src/pybind/mgr/rbd_support/module.py
src/pybind/mgr/rbd_support/perf.py
src/pybind/mgr/rbd_support/task.py
src/pybind/mgr/rbd_support/trash_purge_schedule.py
 - Above conflicts were due to commit e4a16e2
   ("mgr/rbd_support: add type annotation") not in pacific
 - Above conflicts were due to commit dcb51b0
   ("mgr/rbd_support: define commands using CLICommand") not in pacific

src/pybind/mgr/rbd_support/mirror_snapshot_schedule.py
src/pybind/mgr/rbd_support/module.py
src/pybind/mgr/rbd_support/perf.py
src/pybind/mgr/rbd_support/task.py
src/pybind/mgr/rbd_support/trash_purge_schedule.py

index 9d59627e6f0e82cb7c702e33ef66cd96957676a9..1797c34e6d238ae77310269846717fff49eae9f0 100644 (file)
@@ -313,10 +313,11 @@ class MirrorSnapshotScheduleHandler:
         self.last_refresh_images = datetime(1970, 1, 1)
         self.create_snapshot_requests = CreateSnapshotRequests(self)
 
-        self.init_schedule_queue()
-
         self.stop_thread = False
         self.thread = Thread(target=self.run)
+
+    def setup(self):
+        self.init_schedule_queue()
         self.thread.start()
 
     def shutdown(self):
index 3e2193702a242af7334262350773e4589710055f..f7f2101cfb3cf0b4d95f9ac80b76b45f51371808 100644 (file)
@@ -158,19 +158,26 @@ class Module(MgrModule):
     def __init__(self, *args, **kwargs):
         super(Module, self).__init__(*args, **kwargs)
         self.client_blocklisted = Event()
+        self.module_ready = False
+        self.init_handlers()
         self.recovery_thread = Thread(target=self.run)
         self.recovery_thread.start()
-        self.setup()
 
-    def setup(self):
-        self.log.info("starting setup")
-        # new client is created and registed in the MgrMap implicitly
-        # as 'rados' is a property attribute.
-        self.rados.wait_for_latest_osdmap()
+    def init_handlers(self):
         self.mirror_snapshot_schedule = MirrorSnapshotScheduleHandler(self)
         self.perf = PerfHandler(self)
         self.task = TaskHandler(self)
         self.trash_purge_schedule = TrashPurgeScheduleHandler(self)
+
+    def setup_handlers(self):
+        self.log.info("starting setup")
+        # new RADOS client is created and registered in the MgrMap
+        # implicitly here as 'rados' is a property attribute.
+        self.rados.wait_for_latest_osdmap()
+        self.mirror_snapshot_schedule.setup()
+        self.perf.setup()
+        self.task.setup()
+        self.trash_purge_schedule.setup()
         self.log.info("setup complete")
         self.module_ready = True
 
@@ -178,13 +185,18 @@ class Module(MgrModule):
         self.log.info("recovery thread starting")
         try:
             while True:
-                # block until rados client is blocklisted
-                self.client_blocklisted.wait()
-                self.log.info("restarting")
+                try:
+                    self.setup_handlers()
+                except (rados.ConnectionShutdown, rbd.ConnectionShutdown):
+                    self.log.exception("setup_handlers: client blocklisted")
+                    self.log.info("recovering from double blocklisting")
+                else:
+                    # block until RADOS client is blocklisted
+                    self.client_blocklisted.wait()
+                    self.log.info("recovering from blocklisting")
                 self.shutdown()
                 self.client_blocklisted.clear()
-                self.setup()
-                self.log.info("restarted")
+                self.init_handlers()
         except Exception as ex:
             self.log.fatal("Fatal runtime error: {}\n{}".format(
                 ex, traceback.format_exc()))
index d8f863fcb416ab707f4d254a91e2df11ce50405d..1a26119b12048b7b3120570d1cdbfd3e6f2a98aa 100644 (file)
@@ -90,6 +90,8 @@ class PerfHandler:
 
         self.stop_thread = False
         self.thread = Thread(target=self.run)
+
+    def setup(self):
         self.thread.start()
 
     def shutdown(self):
index cfd37c7d158983ca5c73b68bbd0c09f2ab5c194b..ca3005c0c14a33f11e21f2e63560ba2fe90fccfe 100644 (file)
@@ -151,11 +151,12 @@ class TaskHandler:
         self.module = module
         self.log = module.log
 
-        with self.lock:
-            self.init_task_queue()
-
         self.stop_thread = False
         self.thread = Thread(target=self.run)
+
+    def setup(self):
+        with self.lock:
+            self.init_task_queue()
         self.thread.start()
 
     @property
index 56ad24876088e891815085a69327536536bf79d1..cbbdb1321b2834a94b38f8cffb621445bb3d9a61 100644 (file)
@@ -25,10 +25,11 @@ class TrashPurgeScheduleHandler:
         self.log = module.log
         self.last_refresh_pools = datetime(1970, 1, 1)
 
-        self.init_schedule_queue()
-
         self.stop_thread = False
         self.thread = Thread(target=self.run)
+
+    def setup(self):
+        self.init_schedule_queue()
         self.thread.start()
 
     def shutdown(self):