mgr/rbd_support: avoid wedging the task queue if pool is removed (49056/head)
author    Ilya Dryomov <idryomov@gmail.com>
          Wed, 23 Nov 2022 17:10:03 +0000 (18:10 +0100)
committer Ilya Dryomov <idryomov@gmail.com>
          Fri, 25 Nov 2022 11:22:32 +0000 (12:22 +0100)
The rados.ObjectNotFound exception handler was referencing the ioctx
variable, which is assigned only if the pool exists and the
rados.open_ioctx() call succeeds.  This led to a fatal error

  mgr[rbd_support] Failed to locate pool mypool
  mgr[rbd_support] execute_task: [errno 2] error opening pool 'b'mypool''
  mgr[rbd_support] Fatal runtime error: local variable 'ioctx' referenced before assignment

and wedged the task queue.  No other commands were processed until the
ceph-mgr daemon was restarted.
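
The underlying Python pitfall reproduces without rados at all.  A minimal
sketch (hypothetical names; LookupError stands in for rados.ObjectNotFound):

  def open_pool(pool_name):
      # stands in for rados.open_ioctx() failing on a missing pool
      raise LookupError("error opening pool '{}'".format(pool_name))

  def execute_task(pool_name):
      try:
          ioctx = open_pool(pool_name)  # raises, so ioctx is never bound
      except LookupError:
          ioctx.close()  # UnboundLocalError escapes the handler instead

  try:
      execute_task("mypool")
  except UnboundLocalError as e:
      # "local variable 'ioctx' referenced before assignment" (pre-3.11
      # wording; newer interpreters phrase it differently)
      print("Fatal runtime error: {}".format(e))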

Fixes: https://tracker.ceph.com/issues/52932
Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
(cherry picked from commit 5a425927ed4c0d0adee3773226ccae26c1c98d30)

Conflicts:
src/pybind/mgr/rbd_support/task.py [ commit e4a16e261370
  ("mgr/rbd_support: add type annotation") not in pacific ]

qa/workunits/rbd/cli_generic.sh
src/pybind/mgr/rbd_support/task.py

index 41f721f61829612c7b1ff72e9b5c0e185f7514a4..58b26dd2d98758ea6428bbbf2de95e0c0fd43c1f 100755 (executable)
@@ -1453,6 +1453,55 @@ test_mirror_pool_peer_bootstrap_create() {
     ceph osd pool rm rbd1 rbd1 --yes-i-really-really-mean-it
 }
 
+test_tasks_removed_pool() {
+    echo "testing removing pool under running tasks..."
+    remove_images
+
+    ceph osd pool create rbd2 8
+    rbd pool init rbd2
+
+    rbd create $RBD_CREATE_ARGS --size 1G foo
+    rbd snap create foo@snap
+    rbd snap protect foo@snap
+    rbd clone foo@snap bar
+
+    rbd create $RBD_CREATE_ARGS --size 1G rbd2/dummy
+    rbd bench --io-type write --io-pattern seq --io-size 1M --io-total 1G rbd2/dummy
+    rbd snap create rbd2/dummy@snap
+    rbd snap protect rbd2/dummy@snap
+    for i in {1..5}; do
+        rbd clone rbd2/dummy@snap rbd2/dummy$i
+    done
+
+    # queue flattens on a few dummy images and remove that pool
+    test "$(ceph rbd task list)" = "[]"
+    for i in {1..5}; do
+        ceph rbd task add flatten rbd2/dummy$i
+    done
+    ceph osd pool delete rbd2 rbd2 --yes-i-really-really-mean-it
+    test "$(ceph rbd task list)" != "[]"
+
+    # queue flatten on another image and check that it completes
+    rbd info bar | grep 'parent: '
+    expect_fail rbd snap unprotect foo@snap
+    ceph rbd task add flatten bar
+    for i in {1..12}; do
+        rbd info bar | grep 'parent: ' || break
+        sleep 10
+    done
+    rbd info bar | expect_fail grep 'parent: '
+    rbd snap unprotect foo@snap
+
+    # check that flattens disrupted by pool removal are cleaned up
+    for i in {1..12}; do
+        test "$(ceph rbd task list)" = "[]" && break
+        sleep 10
+    done
+    test "$(ceph rbd task list)" = "[]"
+
+    remove_images
+}
+
 test_pool_image_args
 test_rename
 test_ls
@@ -1477,5 +1526,6 @@ test_trash_purge_schedule
 test_mirror_snapshot_schedule
 test_perf_image_iostat
 test_mirror_pool_peer_bootstrap_create
+test_tasks_removed_pool
 
 echo OK
index 87d43eca15a6e97bd103d088df2a793614e6d18b..ff096fd9bd5e3a1ffe1a6be01bb6ebda43d3987a 100644 (file)
@@ -328,13 +328,14 @@ class TaskHandler:
 
     def remove_task(self, ioctx, task, remove_in_memory=True):
         self.log.info("remove_task: task={}".format(str(task)))
-        omap_keys = (task.sequence_key, )
-        try:
-            with rados.WriteOpCtx() as write_op:
-                ioctx.remove_omap_keys(write_op, omap_keys)
-                ioctx.operate_write_op(write_op, RBD_TASK_OID)
-        except rados.ObjectNotFound:
-            pass
+        if ioctx:
+            try:
+                with rados.WriteOpCtx() as write_op:
+                    omap_keys = (task.sequence_key, )
+                    ioctx.remove_omap_keys(write_op, omap_keys)
+                    ioctx.operate_write_op(write_op, RBD_TASK_OID)
+            except rados.ObjectNotFound:
+                pass
 
         if remove_in_memory:
             try:
@@ -398,9 +399,9 @@ class TaskHandler:
                 task.retry_message = "{}".format(e)
                 self.update_progress(task, 0)
             else:
-                # pool DNE -- remove the task
+                # pool DNE -- remove in-memory task
                 self.complete_progress(task)
-                self.remove_task(ioctx, task)
+                self.remove_task(None, task)
 
         except (rados.Error, rbd.Error) as e:
             self.log.error("execute_task: {}".format(e))
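
For reference, the fixed control flow reduces to this minimal sketch (plain
Python stand-ins, hypothetical names -- not the actual mgr code): when the
pool lookup fails, cleanup runs with ioctx=None and remove_task() skips the
omap update instead of tripping over an unbound variable.

  def open_pool(pool_name):
      # stands in for rados.open_ioctx() failing on a missing pool
      raise LookupError("error opening pool '{}'".format(pool_name))

  def remove_task(ioctx, task):
      if ioctx:
          print("removing omap key for {}".format(task))  # on-disk state
      print("removing in-memory task {}".format(task))

  def execute_task(pool_name, task):
      try:
          ioctx = open_pool(pool_name)
          # ... normal task execution against ioctx ...
      except LookupError:
          # pool DNE -- remove in-memory task only
          remove_task(None, task)

  execute_task("mypool", "flatten rbd2/dummy1")  # completes instead of wedging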