From 1ed680eaff235625c0a452e93b18a3d5e45eb289 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Wed, 23 Nov 2022 18:10:03 +0100 Subject: [PATCH] mgr/rbd_support: avoid wedging the task queue if pool is removed rados.ObjectNotFound exception handler was referencing ioctx variable which is assigned only if the pool exists and rados.open_ioctx() call succeeds. This led to a fatal error mgr[rbd_support] Failed to locate pool mypool mgr[rbd_support] execute_task: [errno 2] error opening pool 'b'mypool'' mgr[rbd_support] Fatal runtime error: local variable 'ioctx' referenced before assignment and wedged the task queue. No other commands were processed until ceph-mgr daemon restart. Fixes: https://tracker.ceph.com/issues/52932 Signed-off-by: Ilya Dryomov (cherry picked from commit 5a425927ed4c0d0adee3773226ccae26c1c98d30) Conflicts: src/pybind/mgr/rbd_support/task.py [ commit e4a16e261370 ("mgr/rbd_support: add type annotation") not in pacific ] --- qa/workunits/rbd/cli_generic.sh | 50 ++++++++++++++++++++++++++++++ src/pybind/mgr/rbd_support/task.py | 19 ++++++------ 2 files changed, 60 insertions(+), 9 deletions(-) diff --git a/qa/workunits/rbd/cli_generic.sh b/qa/workunits/rbd/cli_generic.sh index 41f721f618296..58b26dd2d9875 100755 --- a/qa/workunits/rbd/cli_generic.sh +++ b/qa/workunits/rbd/cli_generic.sh @@ -1453,6 +1453,55 @@ test_mirror_pool_peer_bootstrap_create() { ceph osd pool rm rbd1 rbd1 --yes-i-really-really-mean-it } +test_tasks_removed_pool() { + echo "testing removing pool under running tasks..." 
+ remove_images + + ceph osd pool create rbd2 8 + rbd pool init rbd2 + + rbd create $RBD_CREATE_ARGS --size 1G foo + rbd snap create foo@snap + rbd snap protect foo@snap + rbd clone foo@snap bar + + rbd create $RBD_CREATE_ARGS --size 1G rbd2/dummy + rbd bench --io-type write --io-pattern seq --io-size 1M --io-total 1G rbd2/dummy + rbd snap create rbd2/dummy@snap + rbd snap protect rbd2/dummy@snap + for i in {1..5}; do + rbd clone rbd2/dummy@snap rbd2/dummy$i + done + + # queue flattens on a few dummy images and remove that pool + test "$(ceph rbd task list)" = "[]" + for i in {1..5}; do + ceph rbd task add flatten rbd2/dummy$i + done + ceph osd pool delete rbd2 rbd2 --yes-i-really-really-mean-it + test "$(ceph rbd task list)" != "[]" + + # queue flatten on another image and check that it completes + rbd info bar | grep 'parent: ' + expect_fail rbd snap unprotect foo@snap + ceph rbd task add flatten bar + for i in {1..12}; do + rbd info bar | grep 'parent: ' || break + sleep 10 + done + rbd info bar | expect_fail grep 'parent: ' + rbd snap unprotect foo@snap + + # check that flattens disrupted by pool removal are cleaned up + for i in {1..12}; do + test "$(ceph rbd task list)" = "[]" && break + sleep 10 + done + test "$(ceph rbd task list)" = "[]" + + remove_images +} + test_pool_image_args test_rename test_ls @@ -1477,5 +1526,6 @@ test_trash_purge_schedule test_mirror_snapshot_schedule test_perf_image_iostat test_mirror_pool_peer_bootstrap_create +test_tasks_removed_pool echo OK diff --git a/src/pybind/mgr/rbd_support/task.py b/src/pybind/mgr/rbd_support/task.py index 87d43eca15a6e..ff096fd9bd5e3 100644 --- a/src/pybind/mgr/rbd_support/task.py +++ b/src/pybind/mgr/rbd_support/task.py @@ -328,13 +328,14 @@ class TaskHandler: def remove_task(self, ioctx, task, remove_in_memory=True): self.log.info("remove_task: task={}".format(str(task))) - omap_keys = (task.sequence_key, ) - try: - with rados.WriteOpCtx() as write_op: - ioctx.remove_omap_keys(write_op, omap_keys) - 
ioctx.operate_write_op(write_op, RBD_TASK_OID) - except rados.ObjectNotFound: - pass + if ioctx: + try: + with rados.WriteOpCtx() as write_op: + omap_keys = (task.sequence_key, ) + ioctx.remove_omap_keys(write_op, omap_keys) + ioctx.operate_write_op(write_op, RBD_TASK_OID) + except rados.ObjectNotFound: + pass if remove_in_memory: try: @@ -398,9 +399,9 @@ class TaskHandler: task.retry_message = "{}".format(e) self.update_progress(task, 0) else: - # pool DNE -- remove the task + # pool DNE -- remove in-memory task self.complete_progress(task) - self.remove_task(ioctx, task) + self.remove_task(None, task) except (rados.Error, rbd.Error) as e: self.log.error("execute_task: {}".format(e)) -- 2.39.5