From 4523d9b68ee84f69e8665a728d4037b53cdf3d6f Mon Sep 17 00:00:00 2001 From: Ramana Raja Date: Wed, 10 May 2023 14:37:44 -0400 Subject: [PATCH] rbd_support: recover from "double blocklisting" Recover from being blocklisted while recovering from blocklisting. When the rbd_support module is being set up to recover from client blocklisting, the module's new rados client connection can also get blocklisted. Currently, this will cause the recovery to fail and the module will remain inoperable. Instead, retry module recovery when the new client gets blocklisted during the module setup in the recovery thread. Fixes: https://tracker.ceph.com/issues/59713 Signed-off-by: Ramana Raja --- .../rbd_support/mirror_snapshot_schedule.py | 5 +-- src/pybind/mgr/rbd_support/module.py | 34 +++++++++++++------ src/pybind/mgr/rbd_support/perf.py | 2 ++ src/pybind/mgr/rbd_support/task.py | 7 ++-- .../mgr/rbd_support/trash_purge_schedule.py | 5 +-- 5 files changed, 35 insertions(+), 18 deletions(-) diff --git a/src/pybind/mgr/rbd_support/mirror_snapshot_schedule.py b/src/pybind/mgr/rbd_support/mirror_snapshot_schedule.py index 921a65c094536..0bae891c67aee 100644 --- a/src/pybind/mgr/rbd_support/mirror_snapshot_schedule.py +++ b/src/pybind/mgr/rbd_support/mirror_snapshot_schedule.py @@ -338,10 +338,11 @@ class MirrorSnapshotScheduleHandler: self.last_refresh_images = datetime(1970, 1, 1) self.create_snapshot_requests = CreateSnapshotRequests(self) - self.init_schedule_queue() - self.stop_thread = False self.thread = Thread(target=self.run) + + def setup(self) -> None: + self.init_schedule_queue() self.thread.start() def shutdown(self) -> None: diff --git a/src/pybind/mgr/rbd_support/module.py b/src/pybind/mgr/rbd_support/module.py index a190930ecc604..c856418f99246 100644 --- a/src/pybind/mgr/rbd_support/module.py +++ b/src/pybind/mgr/rbd_support/module.py @@ -83,19 +83,26 @@ class Module(MgrModule): def __init__(self, *args: Any, **kwargs: Any) -> None: super(Module, self).__init__(*args, **kwargs) self.client_blocklisted = Event() + self.module_ready = False + self.init_handlers() self.recovery_thread = Thread(target=self.run) self.recovery_thread.start() - self.setup() - def setup(self) -> None: - self.log.info("starting setup") - # new client is created and registed in the MgrMap implicitly - # as 'rados' is a property attribute. - self.rados.wait_for_latest_osdmap() + def init_handlers(self) -> None: self.mirror_snapshot_schedule = MirrorSnapshotScheduleHandler(self) self.perf = PerfHandler(self) self.task = TaskHandler(self) self.trash_purge_schedule = TrashPurgeScheduleHandler(self) + + def setup_handlers(self) -> None: + self.log.info("starting setup") + # new RADOS client is created and registered in the MgrMap + # implicitly here as 'rados' is a property attribute. + self.rados.wait_for_latest_osdmap() + self.mirror_snapshot_schedule.setup() + self.perf.setup() + self.task.setup() + self.trash_purge_schedule.setup() self.log.info("setup complete") self.module_ready = True @@ -103,13 +110,18 @@ class Module(MgrModule): self.log.info("recovery thread starting") try: while True: - # block until rados client is blocklisted - self.client_blocklisted.wait() - self.log.info("restarting") + try: + self.setup_handlers() + except (rados.ConnectionShutdown, rbd.ConnectionShutdown): + self.log.exception("setup_handlers: client blocklisted") + self.log.info("recovering from double blocklisting") + else: + # block until RADOS client is blocklisted + self.client_blocklisted.wait() + self.log.info("recovering from blocklisting") self.shutdown() self.client_blocklisted.clear() - self.setup() - self.log.info("restarted") + self.init_handlers() except Exception as ex: self.log.fatal("Fatal runtime error: {}\n{}".format( ex, traceback.format_exc())) diff --git a/src/pybind/mgr/rbd_support/perf.py b/src/pybind/mgr/rbd_support/perf.py index 4bcf0a18c56b7..68cbbd3b5f48b 100644 --- a/src/pybind/mgr/rbd_support/perf.py +++ b/src/pybind/mgr/rbd_support/perf.py @@ -119,6 +119,8 @@ class PerfHandler: self.stop_thread = False self.thread = Thread(target=self.run) + + def setup(self) -> None: self.thread.start() def shutdown(self) -> None: diff --git a/src/pybind/mgr/rbd_support/task.py b/src/pybind/mgr/rbd_support/task.py index 7dba510baa781..101d480dc1d7a 100644 --- a/src/pybind/mgr/rbd_support/task.py +++ b/src/pybind/mgr/rbd_support/task.py @@ -166,11 +166,12 @@ class TaskHandler: self.module = module self.log = module.log - with self.lock: - self.init_task_queue() - self.stop_thread = False self.thread = Thread(target=self.run) + + def setup(self) -> None: + with self.lock: + self.init_task_queue() self.thread.start() @property diff --git a/src/pybind/mgr/rbd_support/trash_purge_schedule.py b/src/pybind/mgr/rbd_support/trash_purge_schedule.py index 9cb349fcac0e7..b2f7b1614f132 100644 --- a/src/pybind/mgr/rbd_support/trash_purge_schedule.py +++ b/src/pybind/mgr/rbd_support/trash_purge_schedule.py @@ -24,10 +24,11 @@ class TrashPurgeScheduleHandler: self.log = module.log self.last_refresh_pools = datetime(1970, 1, 1) - self.init_schedule_queue() - self.stop_thread = False self.thread = Thread(target=self.run) + + def setup(self) -> None: + self.init_schedule_queue() self.thread.start() def shutdown(self) -> None: -- 2.47.3