From: Ramana Raja Date: Wed, 10 May 2023 18:37:44 +0000 (-0400) Subject: rbd_support: recover from "double blocklisting" X-Git-Tag: v16.2.14~11^2~2 X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=18e1449e5b953aacf68c275f2ad28baa297bda02;p=ceph.git rbd_support: recover from "double blocklisting" Recover from being blocklisted while recovering from blocklisting. When the rbd_support module is being set up to recover from client blocklisting, the module's new rados client connection can also get blocklisted. Currently, this will cause the recovery to fail and the module will remain inoperable. Instead, retry module recovery when the new client gets blocklisted during the module setup in the recovery thread. Fixes: https://tracker.ceph.com/issues/59713 Signed-off-by: Ramana Raja (cherry picked from commit 4523d9b68ee84f69e8665a728d4037b53cdf3d6f) Conflicts: src/pybind/mgr/rbd_support/mirror_snapshot_schedule.py src/pybind/mgr/rbd_support/module.py src/pybind/mgr/rbd_support/perf.py src/pybind/mgr/rbd_support/task.py src/pybind/mgr/rbd_support/trash_purge_schedule.py - Above conflicts were due to commit e4a16e2 ("mgr/rbd_support: add type annotation") not in pacific - Above conflicts were due to commit dcb51b0 ("mgr/rbd_support: define commands using CLICommand") not in pacific --- diff --git a/src/pybind/mgr/rbd_support/mirror_snapshot_schedule.py b/src/pybind/mgr/rbd_support/mirror_snapshot_schedule.py index 9d59627e6f0e..1797c34e6d23 100644 --- a/src/pybind/mgr/rbd_support/mirror_snapshot_schedule.py +++ b/src/pybind/mgr/rbd_support/mirror_snapshot_schedule.py @@ -313,10 +313,11 @@ class MirrorSnapshotScheduleHandler: self.last_refresh_images = datetime(1970, 1, 1) self.create_snapshot_requests = CreateSnapshotRequests(self) - self.init_schedule_queue() - self.stop_thread = False self.thread = Thread(target=self.run) + + def setup(self): + self.init_schedule_queue() self.thread.start() def shutdown(self): diff --git a/src/pybind/mgr/rbd_support/module.py b/src/pybind/mgr/rbd_support/module.py index 3e2193702a24..f7f2101cfb3c 100644 --- a/src/pybind/mgr/rbd_support/module.py +++ b/src/pybind/mgr/rbd_support/module.py @@ -158,19 +158,26 @@ class Module(MgrModule): def __init__(self, *args, **kwargs): super(Module, self).__init__(*args, **kwargs) self.client_blocklisted = Event() + self.module_ready = False + self.init_handlers() self.recovery_thread = Thread(target=self.run) self.recovery_thread.start() - self.setup() - def setup(self): - self.log.info("starting setup") - # new client is created and registed in the MgrMap implicitly - # as 'rados' is a property attribute. - self.rados.wait_for_latest_osdmap() + def init_handlers(self): self.mirror_snapshot_schedule = MirrorSnapshotScheduleHandler(self) self.perf = PerfHandler(self) self.task = TaskHandler(self) self.trash_purge_schedule = TrashPurgeScheduleHandler(self) + + def setup_handlers(self): + self.log.info("starting setup") + # new RADOS client is created and registered in the MgrMap + # implicitly here as 'rados' is a property attribute. + self.rados.wait_for_latest_osdmap() + self.mirror_snapshot_schedule.setup() + self.perf.setup() + self.task.setup() + self.trash_purge_schedule.setup() self.log.info("setup complete") self.module_ready = True @@ -178,13 +185,18 @@ class Module(MgrModule): self.log.info("recovery thread starting") try: while True: - # block until rados client is blocklisted - self.client_blocklisted.wait() - self.log.info("restarting") + try: + self.setup_handlers() + except (rados.ConnectionShutdown, rbd.ConnectionShutdown): + self.log.exception("setup_handlers: client blocklisted") + self.log.info("recovering from double blocklisting") + else: + # block until RADOS client is blocklisted + self.client_blocklisted.wait() + self.log.info("recovering from blocklisting") self.shutdown() self.client_blocklisted.clear() - self.setup() - self.log.info("restarted") + self.init_handlers() except Exception as ex: self.log.fatal("Fatal runtime error: {}\n{}".format( ex, traceback.format_exc())) diff --git a/src/pybind/mgr/rbd_support/perf.py b/src/pybind/mgr/rbd_support/perf.py index d8f863fcb416..1a26119b1204 100644 --- a/src/pybind/mgr/rbd_support/perf.py +++ b/src/pybind/mgr/rbd_support/perf.py @@ -90,6 +90,8 @@ class PerfHandler: self.stop_thread = False self.thread = Thread(target=self.run) + + def setup(self): self.thread.start() def shutdown(self): diff --git a/src/pybind/mgr/rbd_support/task.py b/src/pybind/mgr/rbd_support/task.py index cfd37c7d1589..ca3005c0c14a 100644 --- a/src/pybind/mgr/rbd_support/task.py +++ b/src/pybind/mgr/rbd_support/task.py @@ -151,11 +151,12 @@ class TaskHandler: self.module = module self.log = module.log - with self.lock: - self.init_task_queue() - self.stop_thread = False self.thread = Thread(target=self.run) + + def setup(self): + with self.lock: + self.init_task_queue() self.thread.start() @property diff --git a/src/pybind/mgr/rbd_support/trash_purge_schedule.py b/src/pybind/mgr/rbd_support/trash_purge_schedule.py index 56ad24876088..cbbdb1321b28 100644 --- a/src/pybind/mgr/rbd_support/trash_purge_schedule.py +++ b/src/pybind/mgr/rbd_support/trash_purge_schedule.py @@ -25,10 +25,11 @@ class TrashPurgeScheduleHandler: self.log = module.log self.last_refresh_pools = datetime(1970, 1, 1) - self.init_schedule_queue() - self.stop_thread = False self.thread = Thread(target=self.run) + + def setup(self): + self.init_schedule_queue() self.thread.start() def shutdown(self):