From 22825c25a5939047134e443f0395cf75894fec16 Mon Sep 17 00:00:00 2001
From: John Spray <john.spray@inktank.com>
Date: Thu, 27 Feb 2014 14:17:54 +0000
Subject: [PATCH] mds_thrash: Refactor gevent usage + get traceback

This simplifies the code to make MdsTrash be a greenlet
(as it logically is) rather than encapsulating one that
gets started in __init__ (spawning threads in constructors
is evil).

With this done, do_thrash is called from _run inside an
exception handler that will give us full tracebacks
if something bad happens.

Signed-off-by: John Spray <john.spray@inktank.com>
---
 teuthology/task/mds_thrash.py | 48 ++++++++++++++++++++++-------------
 1 file changed, 31 insertions(+), 17 deletions(-)

diff --git a/teuthology/task/mds_thrash.py b/teuthology/task/mds_thrash.py
index 40186e6296c9..7b77d7943ed7 100644
--- a/teuthology/task/mds_thrash.py
+++ b/teuthology/task/mds_thrash.py
@@ -6,13 +6,14 @@ import contextlib
 import ceph_manager
 import random
 import time
-import gevent
+from gevent.greenlet import Greenlet
+from gevent.event import Event
 from teuthology import misc as teuthology
 
 log = logging.getLogger(__name__)
 
 
-class MDSThrasher:
+class MDSThrasher(Greenlet):
     """
     MDSThrasher::
 
@@ -34,7 +35,7 @@ class MDSThrasher:
 
     max_revive_delay: [default: 10] maximum number of seconds to delay before
       bringing back a thrashed MDS
-  
+
     thrash_in_replay: [default: 0.0] likelihood that the MDS will be thrashed
       during replay.  Value should be between 0.0 and 1.0
 
@@ -82,11 +83,13 @@ class MDSThrasher:
     """
 
     def __init__(self, ctx, manager, config, logger, failure_group, weight):
+        super(MDSThrasher, self).__init__()
+
         self.ctx = ctx
         self.manager = manager
-        self.manager.wait_for_clean()
+        assert self.manager.is_clean()
 
-        self.stopping = False
+        self.stopping = Event()
         self.logger = logger
         self.config = config
 
@@ -100,19 +103,24 @@ class MDSThrasher:
 
         self.max_revive_delay = float(self.config.get('max_revive_delay', 10.0))
 
-        self.thread = gevent.spawn(self.do_thrash)
-
         self.failure_group = failure_group
         self.weight = weight
 
+    def _run(self):
+        try:
+            self.do_thrash()
+        except:
+            # Log exceptions here so we get the full backtrace (it's lost
+            # by the time someone does a .get() on this greenlet)
+            self.logger.exception("Exception in do_thrash:")
+            raise
+
     def log(self, x):
         """Write data to logger assigned to this MDThrasher"""
         self.logger.info(x)
 
-    def do_join(self):
-        """Thread finished"""
-        self.stopping = True
-        self.thread.get()
+    def stop(self):
+        self.stopping.set()
 
     def do_thrash(self):
         """
@@ -120,14 +128,16 @@ class MDSThrasher:
         """
         self.log('starting mds_do_thrash for failure group: ' + ', '.join(
             ['mds.{_id}'.format(_id=_f) for _f in self.failure_group]))
-        while not self.stopping:
+        while not self.stopping.is_set():
             delay = self.max_thrash_delay
             if self.randomize:
                 delay = random.randrange(0.0, self.max_thrash_delay)
 
             if delay > 0.0:
                 self.log('waiting for {delay} secs before thrashing'.format(delay=delay))
-                time.sleep(delay)
+                self.stopping.wait(delay)
+                if self.stopping.is_set():
+                    continue
 
             skip = random.randrange(0.0, 1.0)
             if self.weight < 1.0 and skip > self.weight:
@@ -147,7 +157,6 @@ class MDSThrasher:
             self.manager.kill_mds_by_rank(active_rank)
 
             # wait for mon to report killed mds as crashed
-            status = {}
             last_laggy_since = None
             itercount = 0
             while True:
@@ -305,15 +314,16 @@ def task(ctx, config):
                 log.info('Assigning mds rank {r} to failure group {g}'.format(r=r, g=active))
                 failure_groups[active].append(r)
 
+    manager.wait_for_clean()
     for (active, standbys) in failure_groups.iteritems():
-
         weight = 1.0
         if 'thrash_weights' in config:
             weight = int(config['thrash_weights'].get('mds.{_id}'.format(_id=active), '0.0'))
 
         failure_group = [active]
         failure_group.extend(standbys)
-        thrashers[active] = MDSThrasher(
+
+        thrasher = MDSThrasher(
             ctx, manager, config,
             logger=log.getChild('mds_thrasher.failure_group.[{a}, {sbs}]'.format(
                 a=active,
@@ -322,6 +332,9 @@ def task(ctx, config):
             ),
             failure_group=failure_group,
             weight=weight)
+        thrasher.start()
+        thrashers[active] = thrasher
+
         # if thrash_weights isn't specified and we've reached max_thrash,
         # we're done
         if not 'thrash_weights' in config and len(thrashers) == max_thrashers:
@@ -334,5 +347,6 @@ def task(ctx, config):
         log.info('joining mds_thrashers')
         for t in thrashers:
             log.info('join thrasher for failure group [{fg}]'.format(fg=', '.join(failure_group)))
-            thrashers[t].do_join()
+            thrashers[t].stop()
+            thrashers[t].join()
         log.info('done joining')
-- 
2.47.3