From: Ramana Raja Date: Wed, 10 May 2023 18:37:44 +0000 (-0400) Subject: rbd_support: recover from "double blocklisting" X-Git-Tag: v18.1.0~95^2~3 X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=e203a000e8abbb0a4272a14b704cd0d359550279;p=ceph.git rbd_support: recover from "double blocklisting" Recover from being blocklisted while recovering from blocklisting. When the rbd_support module is being set up to recover from client blocklisting, the module's new rados client connection can also get blocklisted. Currently, this will cause the recovery to fail and the module will remain inoperable. Instead, retry module recovery when the new client gets blocklisted during the module setup in the recovery thread. Fixes: https://tracker.ceph.com/issues/59713 Signed-off-by: Ramana Raja (cherry picked from commit 4523d9b68ee84f69e8665a728d4037b53cdf3d6f) --- diff --git a/src/pybind/mgr/rbd_support/mirror_snapshot_schedule.py b/src/pybind/mgr/rbd_support/mirror_snapshot_schedule.py index 921a65c0945..0bae891c67a 100644 --- a/src/pybind/mgr/rbd_support/mirror_snapshot_schedule.py +++ b/src/pybind/mgr/rbd_support/mirror_snapshot_schedule.py @@ -338,10 +338,11 @@ class MirrorSnapshotScheduleHandler: self.last_refresh_images = datetime(1970, 1, 1) self.create_snapshot_requests = CreateSnapshotRequests(self) - self.init_schedule_queue() - self.stop_thread = False self.thread = Thread(target=self.run) + + def setup(self) -> None: + self.init_schedule_queue() self.thread.start() def shutdown(self) -> None: diff --git a/src/pybind/mgr/rbd_support/module.py b/src/pybind/mgr/rbd_support/module.py index a190930ecc6..c856418f992 100644 --- a/src/pybind/mgr/rbd_support/module.py +++ b/src/pybind/mgr/rbd_support/module.py @@ -83,19 +83,26 @@ class Module(MgrModule): def __init__(self, *args: Any, **kwargs: Any) -> None: super(Module, self).__init__(*args, **kwargs) self.client_blocklisted = Event() + self.module_ready = False + self.init_handlers() self.recovery_thread = Thread(target=self.run) self.recovery_thread.start() - self.setup() - def setup(self) -> None: - self.log.info("starting setup") - # new client is created and registed in the MgrMap implicitly - # as 'rados' is a property attribute. - self.rados.wait_for_latest_osdmap() + def init_handlers(self) -> None: self.mirror_snapshot_schedule = MirrorSnapshotScheduleHandler(self) self.perf = PerfHandler(self) self.task = TaskHandler(self) self.trash_purge_schedule = TrashPurgeScheduleHandler(self) + + def setup_handlers(self) -> None: + self.log.info("starting setup") + # new RADOS client is created and registered in the MgrMap + # implicitly here as 'rados' is a property attribute. + self.rados.wait_for_latest_osdmap() + self.mirror_snapshot_schedule.setup() + self.perf.setup() + self.task.setup() + self.trash_purge_schedule.setup() self.log.info("setup complete") self.module_ready = True @@ -103,13 +110,18 @@ class Module(MgrModule): self.log.info("recovery thread starting") try: while True: - # block until rados client is blocklisted - self.client_blocklisted.wait() - self.log.info("restarting") + try: + self.setup_handlers() + except (rados.ConnectionShutdown, rbd.ConnectionShutdown): + self.log.exception("setup_handlers: client blocklisted") + self.log.info("recovering from double blocklisting") + else: + # block until RADOS client is blocklisted + self.client_blocklisted.wait() + self.log.info("recovering from blocklisting") self.shutdown() self.client_blocklisted.clear() - self.setup() - self.log.info("restarted") + self.init_handlers() except Exception as ex: self.log.fatal("Fatal runtime error: {}\n{}".format( ex, traceback.format_exc())) diff --git a/src/pybind/mgr/rbd_support/perf.py b/src/pybind/mgr/rbd_support/perf.py index 4bcf0a18c56..68cbbd3b5f4 100644 --- a/src/pybind/mgr/rbd_support/perf.py +++ b/src/pybind/mgr/rbd_support/perf.py @@ -119,6 +119,8 @@ class PerfHandler: self.stop_thread = False self.thread = Thread(target=self.run) + + def setup(self) -> None: self.thread.start() def shutdown(self) -> None: diff --git a/src/pybind/mgr/rbd_support/task.py b/src/pybind/mgr/rbd_support/task.py index 7dba510baa7..101d480dc1d 100644 --- a/src/pybind/mgr/rbd_support/task.py +++ b/src/pybind/mgr/rbd_support/task.py @@ -166,11 +166,12 @@ class TaskHandler: self.module = module self.log = module.log - with self.lock: - self.init_task_queue() - self.stop_thread = False self.thread = Thread(target=self.run) + + def setup(self) -> None: + with self.lock: + self.init_task_queue() self.thread.start() @property diff --git a/src/pybind/mgr/rbd_support/trash_purge_schedule.py b/src/pybind/mgr/rbd_support/trash_purge_schedule.py index 9cb349fcac0..b2f7b1614f1 100644 --- a/src/pybind/mgr/rbd_support/trash_purge_schedule.py +++ b/src/pybind/mgr/rbd_support/trash_purge_schedule.py @@ -24,10 +24,11 @@ class TrashPurgeScheduleHandler: self.log = module.log self.last_refresh_pools = datetime(1970, 1, 1) - self.init_schedule_queue() - self.stop_thread = False self.thread = Thread(target=self.run) + + def setup(self) -> None: + self.init_schedule_queue() self.thread.start() def shutdown(self) -> None: