ceph_manager: in do_pg_scrub(), don't resubmit a request for 2 minutes

author David Zafman <dzafman@redhat.com>

Wed, 14 Sep 2016 22:43:02 +0000 (15:43 -0700)

committer David Zafman <dzafman@redhat.com>

Wed, 5 Oct 2016 04:30:31 +0000 (21:30 -0700)
author David Zafman <dzafman@redhat.com>
Wed, 14 Sep 2016 22:43:02 +0000 (15:43 -0700)
committer David Zafman <dzafman@redhat.com>
Wed, 5 Oct 2016 04:30:31 +0000 (21:30 -0700)
diff --git a/tasks/ceph_manager.py b/tasks/ceph_manager.py

index 9a4f9aff960c29f6ce0e684653c4d82691284032..876d025457624e5c298c3c84868392f53ec59958 100644 (file)
--- a/tasks/ceph_manager.py
+++ b/tasks/ceph_manager.py
@@ -1561,10 +1561,20 @@ class CephManager:
          Scrub pg and wait for scrubbing to finish
          """
          init = self.get_last_scrub_stamp(pool, pgnum)
+        RESEND_TIMEOUT = 120    # Must be a multiple of SLEEP_TIME
+        FATAL_TIMEOUT = RESEND_TIMEOUT * 3
+        SLEEP_TIME = 10
+        timer = 0
          while init == self.get_last_scrub_stamp(pool, pgnum):
+            assert timer < FATAL_TIMEOUT, "fatal timeout trying to " + stype
              self.log("waiting for scrub type %s" % (stype,))
-            self.raw_cluster_cmd('pg', stype, self.get_pgid(pool, pgnum))
-            time.sleep(10)
+            if (timer % RESEND_TIMEOUT) == 0:
+                self.raw_cluster_cmd('pg', stype, self.get_pgid(pool, pgnum))
+                # The first time in this loop is the actual request
+                if timer != 0 and stype == "repair":
+                    self.log("WARNING: Resubmitted a non-idempotent repair")
+            time.sleep(SLEEP_TIME)
+            timer += SLEEP_TIME
  
      def get_single_pg_stats(self, pgid):
          """
author	David Zafman <dzafman@redhat.com>
	Wed, 14 Sep 2016 22:43:02 +0000 (15:43 -0700)
committer	David Zafman <dzafman@redhat.com>
	Wed, 5 Oct 2016 04:30:31 +0000 (21:30 -0700)