]> git-server-git.apps.pok.os.sepia.ceph.com Git - ceph-ci.git/commitdiff
mgr/volumes: Fail subvolume removal if it's in progress
authorKotresh HR <khiremat@redhat.com>
Fri, 16 Jul 2021 10:28:37 +0000 (15:58 +0530)
committerKotresh HR <khiremat@redhat.com>
Fri, 30 Jul 2021 07:44:28 +0000 (13:14 +0530)
Removing an in-progress subvolume clone with force doesn't
remove the clone index (tracker). This results in the cloner
thread getting stuck in a loop, trying to clone the deleted one.

This patch addresses the issue by not allowing the subvolume clone
to be removed if it's not complete/cancelled/failed even with force option.
It throws the error EAGAIN, asking the user to cancel the pending clone
and retry.

Fixes: https://tracker.ceph.com/issues/51707
Signed-off-by: Kotresh HR <khiremat@redhat.com>
qa/tasks/cephfs/test_volumes.py
src/pybind/mgr/volumes/fs/operations/versions/subvolume_v2.py
src/pybind/mgr/volumes/fs/volume.py

index f5dc9fa5588fb094a8aeb2f6d396cd2b5583da6e..89cec3af35e16282e8f603825ad0a12279072e5e 100644 (file)
@@ -3700,6 +3700,59 @@ class TestSubvolumeSnapshotClones(TestVolumesHelper):
         # verify trash dir is clean
         self._wait_for_trash_empty()
 
+    def test_subvolume_snapshot_in_complete_clone_rm(self):
+        """
+        Validates the removal of clone when it is not in 'complete|cancelled|failed' state.
+        The forceful removal of a subvolume clone succeeds only if it's in any of the
+        'complete|cancelled|failed' states. It fails with EAGAIN in any other state.
+        """
+
+        subvolume = self._generate_random_subvolume_name()
+        snapshot = self._generate_random_snapshot_name()
+        clone = self._generate_random_clone_name()
+
+        # create subvolume
+        self._fs_cmd("subvolume", "create", self.volname, subvolume, "--mode=777")
+
+        # do some IO
+        self._do_subvolume_io(subvolume, number_of_files=64)
+
+        # snapshot subvolume
+        self._fs_cmd("subvolume", "snapshot", "create", self.volname, subvolume, snapshot)
+
+        # Insert delay at the beginning of snapshot clone
+        self.config_set('mgr', 'mgr/volumes/snapshot_clone_delay', 2)
+
+        # schedule a clone
+        self._fs_cmd("subvolume", "snapshot", "clone", self.volname, subvolume, snapshot, clone)
+
+        # Use --force since clone is not complete. Returns EAGAIN as the clone is neither complete nor cancelled.
+        try:
+            self._fs_cmd("subvolume", "rm", self.volname, clone, "--force")
+        except CommandFailedError as ce:
+            if ce.exitstatus != errno.EAGAIN:
+                raise RuntimeError("invalid error code when trying to remove failed clone")
+        else:
+            raise RuntimeError("expected error when removing a failed clone")
+
+        # cancel on-going clone
+        self._fs_cmd("clone", "cancel", self.volname, clone)
+
+        # verify canceled state
+        self._check_clone_canceled(clone)
+
+        # clone removal should succeed after cancel
+        self._fs_cmd("subvolume", "rm", self.volname, clone, "--force")
+
+        # remove snapshot
+        self._fs_cmd("subvolume", "snapshot", "rm", self.volname, subvolume, snapshot)
+
+        # remove subvolumes
+        self._fs_cmd("subvolume", "rm", self.volname, subvolume)
+
+        # verify trash dir is clean
+        self._wait_for_trash_empty()
+
     def test_subvolume_snapshot_clone_retain_suid_guid(self):
         subvolume = self._generate_random_subvolume_name()
         snapshot = self._generate_random_snapshot_name()
index 1dd6f3fe3aa82e6eb0d02ff45fd5c46d7845d368..ca92279b5c98291d3d1151b4714c1f5587325488 100644 (file)
@@ -148,7 +148,7 @@ class SubvolumeV2(SubvolumeV1):
                 raise VolumeException(-e.args[0], e.args[1])
         else:
             log.info("cleaning up subvolume with path: {0}".format(self.subvolname))
-            self.remove()
+            self.remove(internal_cleanup=True)
 
     def _set_incarnation_metadata(self, subvolume_type, qpath, initial_state):
         self.metadata_mgr.update_global_section(MetadataManager.GLOBAL_META_KEY_TYPE, subvolume_type.value)
@@ -337,11 +337,25 @@ class SubvolumeV2(SubvolumeV1):
         except cephfs.Error as e:
             raise VolumeException(-e.args[0], e.args[1])
 
-    def remove(self, retainsnaps=False):
+    @staticmethod
+    def safe_to_remove_subvolume_clone(subvol_state):
+    # Both the STATE_FAILED and STATE_CANCELED are handled by 'handle_clone_failed' in the state
+    # machine, which removes the entry from the index. Hence, it's safe to remove the clone
+    # with the force option for both.
+        acceptable_rm_clone_states = [SubvolumeStates.STATE_COMPLETE, SubvolumeStates.STATE_CANCELED,
+                                      SubvolumeStates.STATE_FAILED]
+        if subvol_state not in acceptable_rm_clone_states:
+            return False
+        return True
+
+    def remove(self, retainsnaps=False, internal_cleanup=False):
         if self.list_snapshots():
             if not retainsnaps:
                 raise VolumeException(-errno.ENOTEMPTY, "subvolume '{0}' has snapshots".format(self.subvolname))
         else:
+            if not internal_cleanup and not self.safe_to_remove_subvolume_clone(self.state):
+                raise VolumeException(-errno.EAGAIN,
+                                      "{0} clone in-progress -- please cancel the clone and retry".format(self.subvolname))
             if not self.has_pending_purges:
                 self.trash_base_dir()
                 # Delete the volume meta file, if it's not already deleted
index a55ca44f31bce4e2c40e1837d8662991e96f2223..a3496d94229f72ce172741f010635de7b1fe9802 100644 (file)
@@ -204,7 +204,7 @@ class VolumeClient(CephfsClient["Module"]):
                     # the purge threads on dump.
                     self.purge_queue.queue_job(volname)
         except VolumeException as ve:
-            if ve.errno == -errno.EAGAIN:
+            if ve.errno == -errno.EAGAIN and not force:
                 ve = VolumeException(ve.errno, ve.error_str + " (use --force to override)")
                 ret = self.volume_exception_to_retval(ve)
             elif not (ve.errno == -errno.ENOENT and force):