This results in the cloner thread getting stuck in a loop, repeatedly trying to clone the deleted subvolume.
+ """ + + subvolume = self._generate_random_subvolume_name() + snapshot = self._generate_random_snapshot_name() + clone = self._generate_random_clone_name() + + # create subvolume + self._fs_cmd("subvolume", "create", self.volname, subvolume, "--mode=777") + + # do some IO + self._do_subvolume_io(subvolume, number_of_files=64) + + # snapshot subvolume + self._fs_cmd("subvolume", "snapshot", "create", self.volname, subvolume, snapshot) + + # Insert delay at the beginning of snapshot clone + self.config_set('mgr', 'mgr/volumes/snapshot_clone_delay', 2) + + # schedule a clone + self._fs_cmd("subvolume", "snapshot", "clone", self.volname, subvolume, snapshot, clone) + + # Use --force since clone is not complete. Returns EAGAIN as clone is not either complete or cancelled. + try: + self._fs_cmd("subvolume", "rm", self.volname, clone, "--force") + except CommandFailedError as ce: + if ce.exitstatus != errno.EAGAIN: + raise RuntimeError("invalid error code when trying to remove failed clone") + else: + raise RuntimeError("expected error when removing a failed clone") + + # cancel on-going clone + self._fs_cmd("clone", "cancel", self.volname, clone) + + # verify canceled state + self._check_clone_canceled(clone) + + # clone removal should succeed after cancel + self._fs_cmd("subvolume", "rm", self.volname, clone, "--force") + + # remove snapshot + self._fs_cmd("subvolume", "snapshot", "rm", self.volname, subvolume, snapshot) + + # remove subvolumes + self._fs_cmd("subvolume", "rm", self.volname, subvolume) + + # verify trash dir is clean + self._wait_for_trash_empty() + def test_subvolume_snapshot_clone_retain_suid_guid(self): subvolume = self._generate_random_subvolume_name() snapshot = self._generate_random_snapshot_name() diff --git a/src/pybind/mgr/volumes/fs/operations/versions/subvolume_v2.py b/src/pybind/mgr/volumes/fs/operations/versions/subvolume_v2.py index 1dd6f3fe3aa..ca92279b5c9 100644 --- a/src/pybind/mgr/volumes/fs/operations/versions/subvolume_v2.py +++ 
b/src/pybind/mgr/volumes/fs/operations/versions/subvolume_v2.py @@ -148,7 +148,7 @@ class SubvolumeV2(SubvolumeV1): raise VolumeException(-e.args[0], e.args[1]) else: log.info("cleaning up subvolume with path: {0}".format(self.subvolname)) - self.remove() + self.remove(internal_cleanup=True) def _set_incarnation_metadata(self, subvolume_type, qpath, initial_state): self.metadata_mgr.update_global_section(MetadataManager.GLOBAL_META_KEY_TYPE, subvolume_type.value) @@ -337,11 +337,25 @@ class SubvolumeV2(SubvolumeV1): except cephfs.Error as e: raise VolumeException(-e.args[0], e.args[1]) - def remove(self, retainsnaps=False): + @staticmethod + def safe_to_remove_subvolume_clone(subvol_state): + # Both the STATE_FAILED and STATE_CANCELED are handled by 'handle_clone_failed' in the state + # machine which removes the entry from the index. Hence, it's safe to removed clone with + # force option for both. + acceptable_rm_clone_states = [SubvolumeStates.STATE_COMPLETE, SubvolumeStates.STATE_CANCELED, + SubvolumeStates.STATE_FAILED] + if subvol_state not in acceptable_rm_clone_states: + return False + return True + + def remove(self, retainsnaps=False, internal_cleanup=False): if self.list_snapshots(): if not retainsnaps: raise VolumeException(-errno.ENOTEMPTY, "subvolume '{0}' has snapshots".format(self.subvolname)) else: + if not internal_cleanup and not self.safe_to_remove_subvolume_clone(self.state): + raise VolumeException(-errno.EAGAIN, + "{0} clone in-progress -- please cancel the clone and retry".format(self.subvolname)) if not self.has_pending_purges: self.trash_base_dir() # Delete the volume meta file, if it's not already deleted diff --git a/src/pybind/mgr/volumes/fs/volume.py b/src/pybind/mgr/volumes/fs/volume.py index a55ca44f31b..a3496d94229 100644 --- a/src/pybind/mgr/volumes/fs/volume.py +++ b/src/pybind/mgr/volumes/fs/volume.py @@ -204,7 +204,7 @@ class VolumeClient(CephfsClient["Module"]): # the purge threads on dump. 
self.purge_queue.queue_job(volname) except VolumeException as ve: - if ve.errno == -errno.EAGAIN: + if ve.errno == -errno.EAGAIN and not force: ve = VolumeException(ve.errno, ve.error_str + " (use --force to override)") ret = self.volume_exception_to_retval(ve) elif not (ve.errno == -errno.ENOENT and force):