From: David Zafman
Date: Wed, 14 Sep 2016 22:43:02 +0000 (-0700)
Subject: ceph_manager: do_pg_scrub() don't resubmit a request for 2 minutes
X-Git-Tag: v11.1.1~58^2^2~84^2~1
X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=60cdb05380ab861fafadfd159602f705c22eaefd;p=ceph.git

ceph_manager: do_pg_scrub() don't resubmit a request for 2 minutes

Add 6 minute fatal timeout
Warn when repair is being resubmitted because it causes races

Signed-off-by: David Zafman
---

diff --git a/tasks/ceph_manager.py b/tasks/ceph_manager.py
index 9a4f9aff960c..876d02545762 100644
--- a/tasks/ceph_manager.py
+++ b/tasks/ceph_manager.py
@@ -1561,10 +1561,20 @@ class CephManager:
         Scrub pg and wait for scrubbing to finish
         """
         init = self.get_last_scrub_stamp(pool, pgnum)
+        RESEND_TIMEOUT = 120  # Must be a multiple of SLEEP_TIME
+        FATAL_TIMEOUT = RESEND_TIMEOUT * 3
+        SLEEP_TIME = 10
+        timer = 0
         while init == self.get_last_scrub_stamp(pool, pgnum):
+            assert timer < FATAL_TIMEOUT, "fatal timeout trying to " + stype
             self.log("waiting for scrub type %s" % (stype,))
-            self.raw_cluster_cmd('pg', stype, self.get_pgid(pool, pgnum))
-            time.sleep(10)
+            if (timer % RESEND_TIMEOUT) == 0:
+                self.raw_cluster_cmd('pg', stype, self.get_pgid(pool, pgnum))
+                # The first time in this loop is the actual request
+                if timer != 0 and stype == "repair":
+                    self.log("WARNING: Resubmitted a non-idempotent repair")
+            time.sleep(SLEEP_TIME)
+            timer += SLEEP_TIME
 
     def get_single_pg_stats(self, pgid):
         """