]> git.apps.os.sepia.ceph.com Git - ceph.git/commitdiff
mgr/volumes: unregister job upon async threads exception 33569/head
authorVenky Shankar <vshankar@redhat.com>
Wed, 26 Feb 2020 04:52:37 +0000 (23:52 -0500)
committerVenky Shankar <vshankar@redhat.com>
Thu, 27 Feb 2020 04:44:46 +0000 (23:44 -0500)
If the async threads hit a temporary exception the job is
never unregistered and therefore gets skipped by the async
threads on subsequent scans.

Patrick hit this in nautilus when one of the purge threads
hit an exception when trying to log a message. The trash
entry was never picked up again by the purge threads.

Fixes: http://tracker.ceph.com/issues/44315
Signed-off-by: Venky Shankar <vshankar@redhat.com>
(cherry picked from commit 46476ef2e290bd15af2fec2410cb4f3f86b27cd2)

src/pybind/mgr/volumes/fs/async_job.py

index b47b45ca87bf5b19241a9861551094c744911fa7..7eb8ca2c37fe365331fd6d152b78b25b2bd6363a 100644 (file)
@@ -28,6 +28,7 @@ class JobThread(threading.Thread):
         thread_name = thread_id.getName()
 
         while retries < JobThread.MAX_RETRIES_ON_EXCEPTION:
+            vol_job = None
             try:
                 # fetch next job to execute
                 with self.async_job.lock:
@@ -40,10 +41,6 @@ class JobThread(threading.Thread):
 
                 # execute the job (outside lock)
                 self.async_job.execute_job(vol_job[0], vol_job[1], should_cancel=lambda: thread_id.should_cancel())
-
-                # when done, unregister the job
-                with self.async_job.lock:
-                    self.async_job.unregister_async_job(vol_job[0], vol_job[1], thread_id)
                 retries = 0
             except NotImplementedException:
                 raise
@@ -56,7 +53,12 @@ class JobThread(threading.Thread):
                 exc_type, exc_value, exc_traceback = sys.exc_info()
                 log.warning("traceback: {0}".format("".join(
                     traceback.format_exception(exc_type, exc_value, exc_traceback))))
-            time.sleep(1)
+            finally:
+                # when done, unregister the job
+                if vol_job:
+                    with self.async_job.lock:
+                        self.async_job.unregister_async_job(vol_job[0], vol_job[1], thread_id)
+                time.sleep(1)
         log.error("thread [{0}] reached exception limit, bailing out...".format(thread_name))
         self.vc.cluster_log("thread {0} bailing out due to exception".format(thread_name))