From 2bcc60061cc364a0ab79b07316c08735d19ec2d3 Mon Sep 17 00:00:00 2001
From: Warren Usui <warren.usui@inktank.com>
Date: Mon, 3 Feb 2014 17:17:09 -0800
Subject: [PATCH] Implement the rest of the checks for automatic pg scrubbing.
 osd_scrub_pgs now: 1.) Ensures that all pgs are clean and active. 2.)
 Performs the scrub of all OSDs. 3.) Waits until all pgs have been scrubbed or
 no progress in scrubbing has been made for two minutes.

These changes have been made on top of a previous check-in that was incomplete.

Fixes: 7198
Signed-off-by: Warren Usui <warren.usui@inktank.com>
---
 teuthology/task/ceph.py | 87 ++++++++++++++++++++++++++++++++++-------
 1 file changed, 73 insertions(+), 14 deletions(-)

diff --git a/teuthology/task/ceph.py b/teuthology/task/ceph.py
index ac705ccb95..92db5269bc 100644
--- a/teuthology/task/ceph.py
+++ b/teuthology/task/ceph.py
@@ -10,6 +10,8 @@ import contextlib
 import logging
 import os
 import struct
+import json
+import time
 
 from teuthology import misc as teuthology
 from teuthology import contextutil
@@ -1023,23 +1025,82 @@ def cluster(ctx, config):
                 ),
             )
 
-def osd_scrub_pgs(ctx, config):
+def get_all_pg_info(rem_site, testdir):
     """
-    Scrub pgs when we exit.
+    Run 'ceph pg dump --format json' on rem_site; return its 'pg_stats' list.
     """
-    
-    for remotes in ctx.cluster.remotes:
-        site_info = remotes.values()
-        for st_info in site_info:
-            for role in st_info:
-                if role.startswith('osd.'):
-                    log.info("Scrubbing osd {osd}".format(osd=role))
-                    testdir = teuthology.get_testdir(ctx)
-                    remotes.run(args=[
+    info = rem_site.run(args=[
                         'adjust-ulimits',
                         'ceph-coverage',
                         '{tdir}/archive/coverage'.format(tdir=testdir),
-                        'ceph', 'osd', 'scrub', role])
+                        'ceph', 'pg', 'dump',
+                        '--format', 'json'], stdout=StringIO())
+    all_info = json.loads(info.stdout.getvalue())
+    return all_info['pg_stats']
+    
+def osd_scrub_pgs(ctx, config):
+    """
+    Scrub all osds on exit and wait until every pg has been scrubbed.
+
+    Polls 'ceph pg dump' up to `retries` times, `delays` seconds apart,
+    until all pgs are active+clean (returns without scrubbing otherwise).
+    Then runs 'ceph osd scrub' for each osd role and polls until every pg's
+    last_scrub_stamp is later than the pre-scrub time, giving up when more
+    than `retries` consecutive polls show no progress.  config is unused.
+    """
+    retries = 12  # polls allowed without reaching the goal / making progress
+    delays = 10  # seconds to sleep between polls
+    vlist = ctx.cluster.remotes.values()
+    testdir = teuthology.get_testdir(ctx)
+    rem_site = ctx.cluster.remotes.keys()[0]  # every ceph command runs here
+    all_clean = False
+    for _ in range(retries):
+        stats = get_all_pg_info(rem_site, testdir)
+        states = [stat['state'] for stat in stats]
+        if len(set(states)) == 1 and states[0] == 'active+clean':
+            all_clean = True
+            break
+        log.info("Waiting for all osds to be active and clean.")
+        time.sleep(delays)
+    if not all_clean:
+        log.info("Scrubbing terminated -- not all pgs were active and clean.")
+        return
+    check_time_now = time.localtime()  # scrub stamps must be later than this
+    time.sleep(1)  # pg stamps are compared below with fractions dropped
+    for slists in vlist:
+        for role in slists:
+            if role.startswith('osd.'):
+                log.info("Scrubbing osd {osd}".format(osd=role))
+                rem_site.run(args=[
+                            'adjust-ulimits',
+                            'ceph-coverage',
+                            '{tdir}/archive/coverage'.format(tdir=testdir),
+                            'ceph', 'osd', 'scrub', role])
+    prev_good = 0  # best count of freshly scrubbed pgs seen so far
+    gap_cnt = 0  # consecutive polls in which that count did not grow
+    loop = True
+    while loop:
+        stats = get_all_pg_info(rem_site, testdir)
+        timez = [stat['last_scrub_stamp'] for stat in stats]
+        loop = False
+        thiscnt = 0
+        for tmval in timez:
+            pgtm = time.strptime(tmval.split('.', 1)[0], '%Y-%m-%d %H:%M:%S')
+            if pgtm > check_time_now:
+                thiscnt += 1
+            else:
+                loop = True  # at least one pg still has a stale stamp
+        if thiscnt > prev_good:
+            prev_good = thiscnt
+            gap_cnt = 0
+        else:
+            gap_cnt += 1
+            if gap_cnt > retries:
+                log.info('Exiting scrub checking -- not all pgs scrubbed.')
+                return
+        if loop:
+            log.info('Still waiting for all pgs to be scrubbed.')
+            time.sleep(delays)
 
 @contextlib.contextmanager
 def run_daemon(ctx, config, type_):
@@ -1167,8 +1228,6 @@ def wait_for_mon_quorum(ctx, config):
     :param ctx: Context
     :param config: Configuration
     """
-    import json
-    import time
 
     assert isinstance(config, list)
     firstmon = teuthology.get_first_mon(ctx, config)
-- 
2.39.5