git.apps.os.sepia.ceph.com Git - ceph-ci.git/commitdiff
ceph_manager: do_pg_scrub() don't resubmit a request for 2 minutes
author David Zafman <dzafman@redhat.com>
Wed, 14 Sep 2016 22:43:02 +0000 (15:43 -0700)
committer David Zafman <dzafman@redhat.com>
Wed, 5 Oct 2016 04:30:31 +0000 (21:30 -0700)
Add 6 minute fatal timeout
Warn when repair is being resubmitted because it causes races

Signed-off-by: David Zafman <dzafman@redhat.com>
tasks/ceph_manager.py

index 9a4f9aff960c29f6ce0e684653c4d82691284032..876d025457624e5c298c3c84868392f53ec59958 100644 (file)
@@ -1561,10 +1561,20 @@ class CephManager:
         Scrub pg and wait for scrubbing to finish
         """
         init = self.get_last_scrub_stamp(pool, pgnum)
+        RESEND_TIMEOUT = 120    # Must be a multiple of SLEEP_TIME
+        FATAL_TIMEOUT = RESEND_TIMEOUT * 3
+        SLEEP_TIME = 10
+        timer = 0
         while init == self.get_last_scrub_stamp(pool, pgnum):
+            assert timer < FATAL_TIMEOUT, "fatal timeout trying to " + stype
             self.log("waiting for scrub type %s" % (stype,))
-            self.raw_cluster_cmd('pg', stype, self.get_pgid(pool, pgnum))
-            time.sleep(10)
+            if (timer % RESEND_TIMEOUT) == 0:
+                self.raw_cluster_cmd('pg', stype, self.get_pgid(pool, pgnum))
+                # The first time in this loop is the actual request
+                if timer != 0 and stype == "repair":
+                    self.log("WARNING: Resubmitted a non-idempotent repair")
+            time.sleep(SLEEP_TIME)
+            timer += SLEEP_TIME
 
     def get_single_pg_stats(self, pgid):
         """