From: Ramana Raja Date: Wed, 10 May 2023 18:37:44 +0000 (-0400) Subject: rbd_support: recover from "double blocklisting" X-Git-Tag: v17.2.7~344^2 X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=2ce8275e4bdaf38d075b0c59287178dec0c5ef32;p=ceph.git rbd_support: recover from "double blocklisting" Recover from being blocklisted while recovering from blocklisting. When the rbd_support module is being set up to recover from client blocklisting, the module's new rados client connection can also get blocklisted. Currently, this will cause the recovery to fail and the module will remain inoperable. Instead, retry module recovery when the new client gets blocklisted during the module setup in the recovery thread. Fixes: https://tracker.ceph.com/issues/59713 Signed-off-by: Ramana Raja (cherry picked from commit 4523d9b68ee84f69e8665a728d4037b53cdf3d6f) --- diff --git a/src/pybind/mgr/rbd_support/mirror_snapshot_schedule.py b/src/pybind/mgr/rbd_support/mirror_snapshot_schedule.py index 187d6e539718..2e53fc1b28c5 100644 --- a/src/pybind/mgr/rbd_support/mirror_snapshot_schedule.py +++ b/src/pybind/mgr/rbd_support/mirror_snapshot_schedule.py @@ -338,10 +338,11 @@ class MirrorSnapshotScheduleHandler: self.last_refresh_images = datetime(1970, 1, 1) self.create_snapshot_requests = CreateSnapshotRequests(self) - self.init_schedule_queue() - self.stop_thread = False self.thread = Thread(target=self.run) + + def setup(self) -> None: + self.init_schedule_queue() self.thread.start() def shutdown(self) -> None: diff --git a/src/pybind/mgr/rbd_support/module.py b/src/pybind/mgr/rbd_support/module.py index a190930ecc60..c856418f9924 100644 --- a/src/pybind/mgr/rbd_support/module.py +++ b/src/pybind/mgr/rbd_support/module.py @@ -83,19 +83,26 @@ class Module(MgrModule): def __init__(self, *args: Any, **kwargs: Any) -> None: super(Module, self).__init__(*args, **kwargs) self.client_blocklisted = Event() + self.module_ready = False + self.init_handlers() self.recovery_thread = Thread(target=self.run) self.recovery_thread.start() - self.setup() - def setup(self) -> None: - self.log.info("starting setup") - # new client is created and registed in the MgrMap implicitly - # as 'rados' is a property attribute. - self.rados.wait_for_latest_osdmap() + def init_handlers(self) -> None: self.mirror_snapshot_schedule = MirrorSnapshotScheduleHandler(self) self.perf = PerfHandler(self) self.task = TaskHandler(self) self.trash_purge_schedule = TrashPurgeScheduleHandler(self) + + def setup_handlers(self) -> None: + self.log.info("starting setup") + # new RADOS client is created and registered in the MgrMap + # implicitly here as 'rados' is a property attribute. + self.rados.wait_for_latest_osdmap() + self.mirror_snapshot_schedule.setup() + self.perf.setup() + self.task.setup() + self.trash_purge_schedule.setup() self.log.info("setup complete") self.module_ready = True @@ -103,13 +110,18 @@ class Module(MgrModule): self.log.info("recovery thread starting") try: while True: - # block until rados client is blocklisted - self.client_blocklisted.wait() - self.log.info("restarting") + try: + self.setup_handlers() + except (rados.ConnectionShutdown, rbd.ConnectionShutdown): + self.log.exception("setup_handlers: client blocklisted") + self.log.info("recovering from double blocklisting") + else: + # block until RADOS client is blocklisted + self.client_blocklisted.wait() + self.log.info("recovering from blocklisting") self.shutdown() self.client_blocklisted.clear() - self.setup() - self.log.info("restarted") + self.init_handlers() except Exception as ex: self.log.fatal("Fatal runtime error: {}\n{}".format( ex, traceback.format_exc())) diff --git a/src/pybind/mgr/rbd_support/perf.py b/src/pybind/mgr/rbd_support/perf.py index 4bcf0a18c56b..68cbbd3b5f48 100644 --- a/src/pybind/mgr/rbd_support/perf.py +++ b/src/pybind/mgr/rbd_support/perf.py @@ -119,6 +119,8 @@ class PerfHandler: self.stop_thread = False self.thread = Thread(target=self.run) + + def setup(self) -> None: self.thread.start() def shutdown(self) -> None: diff --git a/src/pybind/mgr/rbd_support/task.py b/src/pybind/mgr/rbd_support/task.py index f572cd5762f4..7d6b2cc1f3a2 100644 --- a/src/pybind/mgr/rbd_support/task.py +++ b/src/pybind/mgr/rbd_support/task.py @@ -165,11 +165,12 @@ class TaskHandler: self.module = module self.log = module.log - with self.lock: - self.init_task_queue() - self.stop_thread = False self.thread = Thread(target=self.run) + + def setup(self) -> None: + with self.lock: + self.init_task_queue() self.thread.start() @property diff --git a/src/pybind/mgr/rbd_support/trash_purge_schedule.py b/src/pybind/mgr/rbd_support/trash_purge_schedule.py index 4b715ec97673..b5ff9f125ab0 100644 --- a/src/pybind/mgr/rbd_support/trash_purge_schedule.py +++ b/src/pybind/mgr/rbd_support/trash_purge_schedule.py @@ -26,10 +26,11 @@ class TrashPurgeScheduleHandler: self.log = module.log self.last_refresh_pools = datetime(1970, 1, 1) - self.init_schedule_queue() - self.stop_thread = False self.thread = Thread(target=self.run) + + def setup(self) -> None: + self.init_schedule_queue() self.thread.start() def shutdown(self) -> None: