From: Venky Shankar Date: Wed, 26 Feb 2020 04:52:37 +0000 (-0500) Subject: mgr/volumes: unregister job upon async threads exception X-Git-Tag: v15.1.1~234^2 X-Git-Url: http://git.apps.os.sepia.ceph.com/?a=commitdiff_plain;h=46476ef2e290bd15af2fec2410cb4f3f86b27cd2;p=ceph-ci.git mgr/volumes: unregister job upon async threads exception If the async threads hit a temporary exception the job is never unregistered and therefore gets skipped by the async threads on subsequent scans. Patrick hit this in nautilus when one of the purge threads hit an exception when trying to log a message. The trash entry was never picked up again by the purge threads. Fixes: http://tracker.ceph.com/issues/44293 Signed-off-by: Venky Shankar --- diff --git a/src/pybind/mgr/volumes/fs/async_job.py b/src/pybind/mgr/volumes/fs/async_job.py index b47b45ca87b..7eb8ca2c37f 100644 --- a/src/pybind/mgr/volumes/fs/async_job.py +++ b/src/pybind/mgr/volumes/fs/async_job.py @@ -28,6 +28,7 @@ class JobThread(threading.Thread): thread_name = thread_id.getName() while retries < JobThread.MAX_RETRIES_ON_EXCEPTION: + vol_job = None try: # fetch next job to execute with self.async_job.lock: @@ -40,10 +41,6 @@ class JobThread(threading.Thread): # execute the job (outside lock) self.async_job.execute_job(vol_job[0], vol_job[1], should_cancel=lambda: thread_id.should_cancel()) - - # when done, unregister the job - with self.async_job.lock: - self.async_job.unregister_async_job(vol_job[0], vol_job[1], thread_id) retries = 0 except NotImplementedException: raise @@ -56,7 +53,12 @@ class JobThread(threading.Thread): exc_type, exc_value, exc_traceback = sys.exc_info() log.warning("traceback: {0}".format("".join( traceback.format_exception(exc_type, exc_value, exc_traceback)))) - time.sleep(1) + finally: + # when done, unregister the job + if vol_job: + with self.async_job.lock: + self.async_job.unregister_async_job(vol_job[0], vol_job[1], thread_id) + time.sleep(1) log.error("thread [{0}] reached exception limit, bailing out...".format(thread_name)) self.vc.cluster_log("thread {0} bailing out due to exception".format(thread_name))