From ec57215d508e6cb5b3a4d84fd6a3a5b0c9b96c71 Mon Sep 17 00:00:00 2001 From: Kotresh HR Date: Tue, 16 Aug 2022 17:08:16 +0530 Subject: [PATCH] mgr/volumes: Better handle config file on osd full scenario The 'metadata_mgr.flush()' used to truncate the config file before flushing the new config data. This could lead to an empty config file when there is no space to write new config data. This patch handles this scenario by writing it to temporary file and rename it to config file. This would retain the config file without truncating it. Also, there are bunch of places which wasn't handling 'MetadataMgrException' because of this. Fixed those. Fixes: https://tracker.ceph.com/issues/55976 Signed-off-by: Kotresh HR --- src/pybind/mgr/volumes/fs/async_cloner.py | 11 +++--- .../operations/versions/metadata_manager.py | 8 +++-- .../fs/operations/versions/subvolume_base.py | 35 ++++++++++++++----- .../fs/operations/versions/subvolume_v1.py | 29 +++++++++------ .../fs/operations/versions/subvolume_v2.py | 4 +-- 5 files changed, 59 insertions(+), 28 deletions(-) diff --git a/src/pybind/mgr/volumes/fs/async_cloner.py b/src/pybind/mgr/volumes/fs/async_cloner.py index 5a1fcbfdf5d00..95f7d64e1b364 100644 --- a/src/pybind/mgr/volumes/fs/async_cloner.py +++ b/src/pybind/mgr/volumes/fs/async_cloner.py @@ -296,9 +296,10 @@ def start_clone_sm(fs_client, volspec, volname, index, groupname, subvolname, st current_state, next_state)) set_clone_state(fs_client, volspec, volname, groupname, subvolname, next_state) current_state = next_state - except VolumeException as ve: - log.error("clone failed for ({0}, {1}, {2}) (current_state: {3}, reason: {4})".format(volname, groupname,\ - subvolname, current_state, ve)) + except (MetadataMgrException, VolumeException) as e: + log.error(f"clone failed for ({volname}, {groupname}, {subvolname}) " + f"(current_state: {current_state}, reason: {e} {os.strerror(-e.args[0])})") + raise def clone(fs_client, volspec, volname, index, clone_path, state_table, should_cancel, snapshot_clone_delay): log.info("cloning to subvolume path: {0}".format(clone_path)) @@ -312,8 +313,8 @@ def clone(fs_client, volspec, volname, index, clone_path, state_table, should_ca log.info("starting clone: ({0}, {1}, {2})".format(volname, groupname, subvolname)) start_clone_sm(fs_client, volspec, volname, index, groupname, subvolname, state_table, should_cancel, snapshot_clone_delay) log.info("finished clone: ({0}, {1}, {2})".format(volname, groupname, subvolname)) - except VolumeException as ve: - log.error("clone failed for ({0}, {1}, {2}), reason: {3}".format(volname, groupname, subvolname, ve)) + except (MetadataMgrException, VolumeException) as e: + log.error(f"clone failed for ({volname}, {groupname}, {subvolname}), reason: {e} {os.strerror(-e.args[0])}") class Cloner(AsyncJobs): """ diff --git a/src/pybind/mgr/volumes/fs/operations/versions/metadata_manager.py b/src/pybind/mgr/volumes/fs/operations/versions/metadata_manager.py index 3c536f8a9b4b5..764f4bc25751a 100644 --- a/src/pybind/mgr/volumes/fs/operations/versions/metadata_manager.py +++ b/src/pybind/mgr/volumes/fs/operations/versions/metadata_manager.py @@ -97,14 +97,16 @@ class MetadataManager(object): if len(self.config.items(section)) == 0: self.config.remove_section(section) - try: with _lock: - fd = self.fs.open(self.config_path, os.O_WRONLY | os.O_CREAT | os.O_TRUNC, self.mode) + tmp_config_path = self.config_path + b'.tmp' + fd = self.fs.open(tmp_config_path, os.O_WRONLY | os.O_CREAT | os.O_TRUNC, self.mode) with _ConfigWriter(self.fs, fd) as cfg_writer: self.config.write(cfg_writer) cfg_writer.fsync() - log.info("wrote {0} bytes to config {1}".format(cfg_writer.wrote, self.config_path)) + self.fs.rename(tmp_config_path, self.config_path) + log.info(f"wrote {cfg_writer.wrote} bytes to config {tmp_config_path}") + log.info(f"Renamed {tmp_config_path} to config {self.config_path}") except cephfs.Error as e: raise MetadataMgrException(-e.args[0], e.args[1]) diff --git a/src/pybind/mgr/volumes/fs/operations/versions/subvolume_base.py b/src/pybind/mgr/volumes/fs/operations/versions/subvolume_base.py index 9e3ca9e25b2ed..906b55e7b83df 100644 --- a/src/pybind/mgr/volumes/fs/operations/versions/subvolume_base.py +++ b/src/pybind/mgr/volumes/fs/operations/versions/subvolume_base.py @@ -433,9 +433,14 @@ class SubvolumeBase(object): 'features': self.features, 'state': self.state.value} def set_user_metadata(self, keyname, value): - self.metadata_mgr.add_section(MetadataManager.USER_METADATA_SECTION) - self.metadata_mgr.update_section(MetadataManager.USER_METADATA_SECTION, keyname, str(value)) - self.metadata_mgr.flush() + try: + self.metadata_mgr.add_section(MetadataManager.USER_METADATA_SECTION) + self.metadata_mgr.update_section(MetadataManager.USER_METADATA_SECTION, keyname, str(value)) + self.metadata_mgr.flush() + except MetadataMgrException as me: + log.error(f"Failed to set user metadata key={keyname} value={value} on subvolume={self.subvol_name} " + f"group={self.group_name} reason={me.args[1]}, errno:{-me.args[0]}, {os.strerror(-me.args[0])}") + raise VolumeException(-me.args[0], me.args[1]) def get_user_metadata(self, keyname): try: @@ -457,7 +462,9 @@ class SubvolumeBase(object): self.metadata_mgr.flush() except MetadataMgrException as me: if me.errno == -errno.ENOENT: - raise VolumeException(-errno.ENOENT, "subvolume metadata not does not exist") + raise VolumeException(-errno.ENOENT, "subvolume metadata does not exist") + log.error(f"Failed to remove user metadata key={keyname} on subvolume={self.subvol_name} " + f"group={self.group_name} reason={me.args[1]}, errno:{-me.args[0]}, {os.strerror(-me.args[0])}") raise VolumeException(-me.args[0], me.args[1]) def get_snap_section_name(self, snapname): @@ -465,10 +472,16 @@ class SubvolumeBase(object): return section; def set_snapshot_metadata(self, snapname, keyname, value): - section = self.get_snap_section_name(snapname) - self.metadata_mgr.add_section(section) - self.metadata_mgr.update_section(section, keyname, str(value)) - self.metadata_mgr.flush() + try: + section = self.get_snap_section_name(snapname) + self.metadata_mgr.add_section(section) + self.metadata_mgr.update_section(section, keyname, str(value)) + self.metadata_mgr.flush() + except MetadataMgrException as me: + log.error(f"Failed to set snapshot metadata key={keyname} value={value} on snap={snapname} " + f"subvolume={self.subvol_name} group={self.group_name} " + f"reason={me.args[1]}, errno:{-me.args[0]}, {os.strerror(-me.args[0])}") + raise VolumeException(-me.args[0], me.args[1]) def get_snapshot_metadata(self, snapname, keyname): try: @@ -476,6 +489,9 @@ class SubvolumeBase(object): except MetadataMgrException as me: if me.errno == -errno.ENOENT: raise VolumeException(-errno.ENOENT, "key '{0}' does not exist.".format(keyname)) + log.error(f"Failed to get snapshot metadata key={keyname} on snap={snapname} " + f"subvolume={self.subvol_name} group={self.group_name} " + f"reason={me.args[1]}, errno:{-me.args[0]}, {os.strerror(-me.args[0])}") raise VolumeException(-me.args[0], me.args[1]) return value @@ -491,4 +507,7 @@ class SubvolumeBase(object): except MetadataMgrException as me: if me.errno == -errno.ENOENT: raise VolumeException(-errno.ENOENT, "snapshot metadata not does not exist") + log.error(f"Failed to remove snapshot metadata key={keyname} on snap={snapname} " + f"subvolume={self.subvol_name} group={self.group_name} " + f"reason={me.args[1]}, errno:{-me.args[0]}, {os.strerror(-me.args[0])}") raise VolumeException(-me.args[0], me.args[1]) diff --git a/src/pybind/mgr/volumes/fs/operations/versions/subvolume_v1.py b/src/pybind/mgr/volumes/fs/operations/versions/subvolume_v1.py index 019054c51465c..e2857cec3d184 100644 --- a/src/pybind/mgr/volumes/fs/operations/versions/subvolume_v1.py +++ b/src/pybind/mgr/volumes/fs/operations/versions/subvolume_v1.py @@ -120,7 +120,7 @@ class SubvolumeV1(SubvolumeBase, SubvolumeTemplate): if isinstance(e, MetadataMgrException): log.error("metadata manager exception: {0}".format(e)) - e = VolumeException(-errno.EINVAL, "exception in subvolume metadata") + e = VolumeException(-errno.EINVAL, f"exception in subvolume metadata: {os.strerror(-e.args[0])}") elif isinstance(e, cephfs.Error): e = VolumeException(-e.args[0], e.args[1]) raise e @@ -141,12 +141,16 @@ class SubvolumeV1(SubvolumeBase, SubvolumeTemplate): self.metadata_mgr.flush() def add_clone_failure(self, errno, error_msg): - self.metadata_mgr.add_section(MetadataManager.CLONE_FAILURE_SECTION) - self.metadata_mgr.update_section(MetadataManager.CLONE_FAILURE_SECTION, - MetadataManager.CLONE_FAILURE_META_KEY_ERRNO, errno) - self.metadata_mgr.update_section(MetadataManager.CLONE_FAILURE_SECTION, - MetadataManager.CLONE_FAILURE_META_KEY_ERROR_MSG, error_msg) - self.metadata_mgr.flush() + try: + self.metadata_mgr.add_section(MetadataManager.CLONE_FAILURE_SECTION) + self.metadata_mgr.update_section(MetadataManager.CLONE_FAILURE_SECTION, + MetadataManager.CLONE_FAILURE_META_KEY_ERRNO, errno) + self.metadata_mgr.update_section(MetadataManager.CLONE_FAILURE_SECTION, + MetadataManager.CLONE_FAILURE_META_KEY_ERROR_MSG, error_msg) + self.metadata_mgr.flush() + except MetadataMgrException as me: + log.error(f"Failed to add clone failure status clone={self.subvol_name} group={self.group_name} " + f"reason={me.args[1]}, errno:{-me.args[0]}, {os.strerror(-me.args[0])}") def create_clone(self, pool, source_volname, source_subvolume, snapname): subvolume_type = SubvolumeTypes.TYPE_CLONE @@ -192,7 +196,7 @@ class SubvolumeV1(SubvolumeBase, SubvolumeTemplate): if isinstance(e, MetadataMgrException): log.error("metadata manager exception: {0}".format(e)) - e = VolumeException(-errno.EINVAL, "exception in subvolume metadata") + e = VolumeException(-errno.EINVAL, f"exception in subvolume metadata: {os.strerror(-e.args[0])}") elif isinstance(e, cephfs.Error): e = VolumeException(-e.args[0], e.args[1]) raise e @@ -806,9 +810,14 @@ class SubvolumeV1(SubvolumeBase, SubvolumeTemplate): if self.has_pending_clones(snapname): raise VolumeException(-errno.EAGAIN, "snapshot '{0}' has pending clones".format(snapname)) snappath = self.snapshot_path(snapname) + try: + self.metadata_mgr.remove_section(self.get_snap_section_name(snapname)) + self.metadata_mgr.flush() + except MetadataMgrException as me: + log.error(f"Failed to remove snapshot metadata on snap={snapname} subvol={self.subvol_name} " + f"group={self.group_name} reason={me.args[1]}, errno:{-me.args[0]}, {os.strerror(-me.args[0])}") + raise VolumeException(-errno.EAGAIN, f"failed to remove snapshot metadata on snap={snapname} reason={me.args[0]} {me.args[1]}") rmsnap(self.fs, snappath) - self.metadata_mgr.remove_section(self.get_snap_section_name(snapname)) - self.metadata_mgr.flush() def snapshot_info(self, snapname): if is_inherited_snap(snapname): diff --git a/src/pybind/mgr/volumes/fs/operations/versions/subvolume_v2.py b/src/pybind/mgr/volumes/fs/operations/versions/subvolume_v2.py index bfebd24e3bf22..31d5f2443c501 100644 --- a/src/pybind/mgr/volumes/fs/operations/versions/subvolume_v2.py +++ b/src/pybind/mgr/volumes/fs/operations/versions/subvolume_v2.py @@ -198,7 +198,7 @@ class SubvolumeV2(SubvolumeV1): if isinstance(e, MetadataMgrException): log.error("metadata manager exception: {0}".format(e)) - e = VolumeException(-errno.EINVAL, "exception in subvolume metadata") + e = VolumeException(-errno.EINVAL, f"exception in subvolume metadata: {os.strerror(-e.args[0])}") elif isinstance(e, cephfs.Error): e = VolumeException(-e.args[0], e.args[1]) raise e @@ -252,7 +252,7 @@ class SubvolumeV2(SubvolumeV1): if isinstance(e, MetadataMgrException): log.error("metadata manager exception: {0}".format(e)) - e = VolumeException(-errno.EINVAL, "exception in subvolume metadata") + e = VolumeException(-errno.EINVAL, f"exception in subvolume metadata: {os.strerror(-e.args[0])}") elif isinstance(e, cephfs.Error): e = VolumeException(-e.args[0], e.args[1]) raise e -- 2.39.5