From b0da2c37d24c89f4e80b68fd335cdd896e5fc17d Mon Sep 17 00:00:00 2001 From: Rishabh Dave Date: Tue, 18 Jun 2024 00:33:28 +0530 Subject: [PATCH] mgr/vol: handle case where clone index entry goes missing In `async_cloner.py`, the clone index entry is fetched to get the next clone job that needs to be executed. It might happen that the clone job was cancelled just when it was going to be picked for execution (IOW, when it was about to move from pending state to in-progress state). Currently, MGR hangs in such a case because the exception `ObjectNotFound` from the CephFS Python bindings is raised and is left uncaught. To prevent this issue, catch the exception, log it and return None to tell `get_job()` of `async_job.py` to look for the next job in the queue. Increase the scope of the try-except in method `get_oldest_clone_entry()` of `clone_index.py` so that when `cephfs.Error`, or any exception derived from it, is thrown by `self.fs.lstat()`, it is not left uncaught. The FS object is also passed to the method `list_one_entry_at_a_time()`, so increasing the scope of the try-except is useful as it will not allow exceptions raised in other calls to CephFS Python binding methods to be left uncaught. 
Fixes: https://tracker.ceph.com/issues/66560 Signed-off-by: Rishabh Dave (cherry picked from commit 3cff7251c86a4670768721f924b11b3de33f807b) --- .../mgr/volumes/fs/operations/clone_index.py | 32 ++++++++++--------- 1 file changed, 17 insertions(+), 15 deletions(-) diff --git a/src/pybind/mgr/volumes/fs/operations/clone_index.py b/src/pybind/mgr/volumes/fs/operations/clone_index.py index f5a850638d8f5..1f16500a6bd85 100644 --- a/src/pybind/mgr/volumes/fs/operations/clone_index.py +++ b/src/pybind/mgr/volumes/fs/operations/clone_index.py @@ -48,24 +48,26 @@ class CloneIndex(Index): raise IndexException(-e.args[0], e.args[1]) def get_oldest_clone_entry(self, exclude=[]): - min_ctime_entry = None - exclude_tracking_ids = [v[0] for v in exclude] - log.debug("excluded tracking ids: {0}".format(exclude_tracking_ids)) - for entry in list_one_entry_at_a_time(self.fs, self.path): - dname = entry.d_name - dpath = os.path.join(self.path, dname) - st = self.fs.lstat(dpath) - if dname not in exclude_tracking_ids and stat.S_ISLNK(st.st_mode): - if min_ctime_entry is None or st.st_ctime < min_ctime_entry[1].st_ctime: - min_ctime_entry = (dname, st) - if min_ctime_entry: - try: + try: + min_ctime_entry = None + exclude_tracking_ids = [v[0] for v in exclude] + log.debug("excluded tracking ids: {0}".format(exclude_tracking_ids)) + for entry in list_one_entry_at_a_time(self.fs, self.path): + dname = entry.d_name + dpath = os.path.join(self.path, dname) + st = self.fs.lstat(dpath) + if dname not in exclude_tracking_ids and stat.S_ISLNK(st.st_mode): + if min_ctime_entry is None or st.st_ctime < min_ctime_entry[1].st_ctime: + min_ctime_entry = (dname, st) + if min_ctime_entry: linklen = min_ctime_entry[1].st_size sink_path = self.fs.readlink(os.path.join(self.path, min_ctime_entry[0]), CloneIndex.PATH_MAX) return (min_ctime_entry[0], sink_path[:linklen]) - except cephfs.Error as e: - raise IndexException(-e.args[0], e.args[1]) - return None + return None + except cephfs.Error as e: + 
log.debug('Exception cephfs.Error has been caught. Printing ' + f'the exception - {e}') + raise IndexException(-e.args[0], e.args[1]) def find_clone_entry_index(self, sink_path): try: -- 2.39.5