git.apps.os.sepia.ceph.com Git - ceph.git/commitdiff
mds_thrash: PEP8-ize whitespace
author John Spray <john.spray@inktank.com>
Thu, 27 Feb 2014 14:09:52 +0000 (14:09 +0000)
committer John Spray <john.spray@inktank.com>
Thu, 27 Feb 2014 14:25:13 +0000 (14:25 +0000)
...so that I can edit the code in a Python IDE without
it lighting up like a Christmas tree!

Signed-off-by: John Spray <john.spray@inktank.com>
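
The diff itself is mechanical: the module's 2-space indentation is widened to the 4 spaces recommended by PEP 8, plus a few spacing fixes around commas and dict literals, with no behavioural change. As a condensed before/after, using the log() method from the diff below:

    # Before: 2-space indentation, which PEP 8 checkers and Python IDE
    # inspections flag on nearly every line.
    class MDSThrasher:
      def log(self, x):
        self.logger.info(x)

    # After: the same code reindented to PEP 8's 4 spaces; behaviour is identical.
    class MDSThrasher:
        def log(self, x):
            self.logger.info(x)
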
teuthology/task/mds_thrash.py

index 8a58932a66a77d69a1e19e4d2b224db775022c81..40186e6296c906f5d6fe643380306ccc85bef20c 100644 (file)
@@ -11,222 +11,231 @@ from teuthology import misc as teuthology
 
 log = logging.getLogger(__name__)
 
-class MDSThrasher:
-  """
-  MDSThrasher::
-
-  The MDSThrasher thrashes MDSs during execution of other tasks (workunits, etc).
-
-  The config is optional.  Many of the config parameters are a maximum value
-  to use when selecting a random value from a range.  To always use the maximum
-  value, set randomize to false.  The config is a dict containing some or all of:
-
-  seed: [no default] seed the random number generator
-
-  randomize: [default: true] enables randomization; delays are chosen randomly up to the max values
-
-  max_thrash: [default: 1] the maximum number of MDSs that will be thrashed at
-    any given time.
-
-  max_thrash_delay: [default: 30] maximum number of seconds to delay before
-    thrashing again.
-
-  max_revive_delay: [default: 10] maximum number of seconds to delay before
-    bringing back a thrashed MDS
 
-  thrash_in_replay: [default: 0.0] likelihood that the MDS will be thrashed
-    during replay.  Value should be between 0.0 and 1.0
-
-  max_replay_thrash_delay: [default: 4] maximum number of seconds to delay while in
-    the replay state before thrashing
-
-  thrash_weights: allows specific MDSs to be thrashed more/less frequently.  This option
-    overrides anything specified by max_thrash.  This option is a dict containing
-    mds.x: weight pairs.  For example, [mds.a: 0.7, mds.b: 0.3, mds.c: 0.0].  Each weight
-    is a value from 0.0 to 1.0.  Any MDSs not specified will be automatically
-    given a weight of 0.0.  For a given MDS, by default the thrasher delays for up
-    to max_thrash_delay, thrashes, waits for the MDS to recover, and iterates.  If a non-zero
-    weight is specified for an MDS, for each iteration the thrasher chooses whether to thrash
-    during that iteration based on a random value [0-1] not exceeding the weight of that MDS.
+class MDSThrasher:
+    """
+    MDSThrasher::
 
-  Examples::
+    The MDSThrasher thrashes MDSs during execution of other tasks (workunits, etc).
 
+    The config is optional.  Many of the config parameters are a maximum value
+    to use when selecting a random value from a range.  To always use the maximum
+    value, set randomize to false.  The config is a dict containing some or all of:
 
-    The following example sets the likelihood that mds.a will be thrashed
-    to 80%, mds.b to 20%, and other MDSs will not be thrashed.  It also sets the
-    likelihood that an MDS will be thrashed in replay to 40%.
-    Thrash weights do not have to sum to 1.
+    seed: [no default] seed the random number generator
 
-    tasks:
-    - ceph:
-    - mds_thrash:
-        thrash_weights:
-          - mds.a: 0.8
-          - mds.b: 0.2
-        thrash_in_replay: 0.4
-    - ceph-fuse:
-    - workunit:
-        clients:
-          all: [suites/fsx.sh]
+    randomize: [default: true] enables randomization; delays are chosen randomly up to the max values
 
-    The following example disables randomization, and uses the max delay values:
+    max_thrash: [default: 1] the maximum number of MDSs that will be thrashed at
+      any given time.
 
-    tasks:
-    - ceph:
-    - mds_thrash:
-        max_thrash_delay: 10
-        max_revive_delay: 1
-        max_replay_thrash_delay: 4
+    max_thrash_delay: [default: 30] maximum number of seconds to delay before
+      thrashing again.
 
-  """
-  def __init__(self, ctx, manager, config, logger, failure_group, weight):
-    self.ctx = ctx
-    self.manager = manager
-    self.manager.wait_for_clean()
+    max_revive_delay: [default: 10] maximum number of seconds to delay before
+      bringing back a thrashed MDS
+  
+    thrash_in_replay: [default: 0.0] likelihood that the MDS will be thrashed
+      during replay.  Value should be between 0.0 and 1.0
 
-    self.stopping = False
-    self.logger = logger
-    self.config = config
+    max_replay_thrash_delay: [default: 4] maximum number of seconds to delay while in
+      the replay state before thrashing
 
-    self.randomize = bool(self.config.get('randomize', True))
-    self.max_thrash_delay = float(self.config.get('thrash_delay', 30.0))
-    self.thrash_in_replay = float(self.config.get('thrash_in_replay', False))
-    assert self.thrash_in_replay >= 0.0 and self.thrash_in_replay <= 1.0, 'thrash_in_replay ({v}) must be between [0.0, 1.0]'.format(v=self.thrash_in_replay)
+    thrash_weights: allows specific MDSs to be thrashed more/less frequently.  This option
+      overrides anything specified by max_thrash.  This option is a dict containing
+      mds.x: weight pairs.  For example, [mds.a: 0.7, mds.b: 0.3, mds.c: 0.0].  Each weight
+      is a value from 0.0 to 1.0.  Any MDSs not specified will be automatically
+      given a weight of 0.0.  For a given MDS, by default the thrasher delays for up
+      to max_thrash_delay, thrashes, waits for the MDS to recover, and iterates.  If a non-zero
+      weight is specified for an MDS, for each iteration the thrasher chooses whether to thrash
+      during that iteration based on a random value [0-1] not exceeding the weight of that MDS.
 
-    self.max_replay_thrash_delay = float(self.config.get('max_replay_thrash_delay', 4.0))
+    Examples::
 
-    self.max_revive_delay = float(self.config.get('max_revive_delay', 10.0))
 
-    self.thread = gevent.spawn(self.do_thrash)
+      The following example sets the likelihood that mds.a will be thrashed
+      to 80%, mds.b to 20%, and other MDSs will not be thrashed.  It also sets the
+      likelihood that an MDS will be thrashed in replay to 40%.
+      Thrash weights do not have to sum to 1.
 
-    self.failure_group = failure_group
-    self.weight = weight
+      tasks:
+      - ceph:
+      - mds_thrash:
+          thrash_weights:
+            - mds.a: 0.8
+            - mds.b: 0.2
+          thrash_in_replay: 0.4
+      - ceph-fuse:
+      - workunit:
+          clients:
+            all: [suites/fsx.sh]
 
-  def log(self, x):
-    """Write data to logger assigned to this MDThrasher"""
-    self.logger.info(x)
+      The following example disables randomization, and uses the max delay values:
 
-  def do_join(self):
-    """Thread finished"""
-    self.stopping = True
-    self.thread.get()
+      tasks:
+      - ceph:
+      - mds_thrash:
+          max_thrash_delay: 10
+          max_revive_delay: 1
+          max_replay_thrash_delay: 4
 
-  def do_thrash(self):
     """
-    Perform the random thrashing action
-    """
-    self.log('starting mds_do_thrash for failure group: ' + ', '.join(['mds.{_id}'.format(_id=_f) for _f in self.failure_group]))
-    while not self.stopping:
-      delay = self.max_thrash_delay
-      if self.randomize:
-          delay = random.randrange(0.0, self.max_thrash_delay)
-
-      if delay > 0.0:
-        self.log('waiting for {delay} secs before thrashing'.format(delay=delay))
-        time.sleep(delay)
-
-      skip = random.randrange(0.0, 1.0)
-      if self.weight < 1.0 and skip > self.weight:
-          self.log('skipping thrash iteration with skip ({skip}) > weight ({weight})'.format(skip=skip, weight=self.weight))
-          continue
-
-      # find the active mds in the failure group
-      statuses = [self.manager.get_mds_status(m) for m in self.failure_group]
-      actives = filter(lambda s: s['state'] == 'up:active', statuses)
-      assert len(actives) == 1, 'Can only have one active in a failure group'
-
-      active_mds = actives[0]['name']
-      active_rank = actives[0]['rank']
-
-      self.log('kill mds.{id} (rank={r})'.format(id=active_mds, r=active_rank))
-      self.manager.kill_mds_by_rank(active_rank)
-
-      # wait for mon to report killed mds as crashed
-      status = {}
-      last_laggy_since = None
-      itercount = 0
-      while True:
-        failed = self.manager.get_mds_status_all()['failed']
-        status = self.manager.get_mds_status(active_mds)
-        if not status:
-            break
-        if 'laggy_since' in status:
-            last_laggy_since = status['laggy_since']
-            break
-        if any([(f == active_mds) for f in failed]):
-            break
-        self.log('waiting till mds map indicates mds.{_id} is laggy/crashed, in failed state, or mds.{_id} is removed from mdsmap'.format(_id=active_mds))
-        itercount = itercount + 1
-        if itercount > 10:
-            self.log('mds map: {status}'.format(status=self.manager.get_mds_status_all()))
-        time.sleep(2)
-      if last_laggy_since:
-        self.log('mds.{_id} reported laggy/crashed since: {since}'.format(_id=active_mds, since=last_laggy_since))
-      else:
-        self.log('mds.{_id} down, removed from mdsmap'.format(_id=active_mds, since=last_laggy_since))
-
-      # wait for a standby mds to takeover and become active
-      takeover_mds = None
-      takeover_rank = None
-      itercount = 0
-      while True:
-        statuses = [self.manager.get_mds_status(m) for m in self.failure_group]
-        actives = filter(lambda s: s and s['state'] == 'up:active', statuses)
-        if len(actives) > 0:
-            assert len(actives) == 1, 'Can only have one active in failure group'
-            takeover_mds = actives[0]['name']
-            takeover_rank = actives[0]['rank']
-            break
-        itercount = itercount + 1
-        if itercount > 10:
-            self.log('mds map: {status}'.format(status=self.manager.get_mds_status_all()))
-
-      self.log('New active mds is mds.{_id}'.format(_id=takeover_mds))
-
-      # wait for a while before restarting old active to become new
-      # standby
-      delay = self.max_revive_delay
-      if self.randomize:
-          delay = random.randrange(0.0, self.max_revive_delay)
-
-      self.log('waiting for {delay} secs before reviving mds.{id}'.format(
-        delay=delay,id=active_mds))
-      time.sleep(delay)
-
-      self.log('reviving mds.{id}'.format(id=active_mds))
-      self.manager.revive_mds(active_mds, standby_for_rank=takeover_rank)
-
-      status = {}
-      while True:
-        status = self.manager.get_mds_status(active_mds)
-        if status and (status['state'] == 'up:standby' or status['state'] == 'up:standby-replay'):
-            break
-        self.log('waiting till mds map indicates mds.{_id} is in standby or standby-replay'.format(_id=active_mds))
-        time.sleep(2)
-      self.log('mds.{_id} reported in {state} state'.format(_id=active_mds, state=status['state']))
-
-      # don't do replay thrashing right now
-      continue
-      # this might race with replay -> active transition...
-      if status['state'] == 'up:replay' and random.randrange(0.0, 1.0) < self.thrash_in_replay:
-
-        delay = self.max_replay_thrash_delay
-        if self.randomize:
-          delay = random.randrange(0.0, self.max_replay_thrash_delay)
-        time.sleep(delay)
-        self.log('kill replaying mds.{id}'.format(id=self.to_kill))
-        self.manager.kill_mds(self.to_kill)
-
-        delay = self.max_revive_delay
-        if self.randomize:
-            delay = random.randrange(0.0, self.max_revive_delay)
-
-        self.log('waiting for {delay} secs before reviving mds.{id}'.format(
-            delay=delay,id=self.to_kill))
-        time.sleep(delay)
 
-        self.log('revive mds.{id}'.format(id=self.to_kill))
-        self.manager.revive_mds(self.to_kill)
+    def __init__(self, ctx, manager, config, logger, failure_group, weight):
+        self.ctx = ctx
+        self.manager = manager
+        self.manager.wait_for_clean()
+
+        self.stopping = False
+        self.logger = logger
+        self.config = config
+
+        self.randomize = bool(self.config.get('randomize', True))
+        self.max_thrash_delay = float(self.config.get('thrash_delay', 30.0))
+        self.thrash_in_replay = float(self.config.get('thrash_in_replay', False))
+        assert self.thrash_in_replay >= 0.0 and self.thrash_in_replay <= 1.0, 'thrash_in_replay ({v}) must be between [0.0, 1.0]'.format(
+            v=self.thrash_in_replay)
+
+        self.max_replay_thrash_delay = float(self.config.get('max_replay_thrash_delay', 4.0))
+
+        self.max_revive_delay = float(self.config.get('max_revive_delay', 10.0))
+
+        self.thread = gevent.spawn(self.do_thrash)
+
+        self.failure_group = failure_group
+        self.weight = weight
+
+    def log(self, x):
+        """Write data to logger assigned to this MDThrasher"""
+        self.logger.info(x)
+
+    def do_join(self):
+        """Thread finished"""
+        self.stopping = True
+        self.thread.get()
+
+    def do_thrash(self):
+        """
+        Perform the random thrashing action
+        """
+        self.log('starting mds_do_thrash for failure group: ' + ', '.join(
+            ['mds.{_id}'.format(_id=_f) for _f in self.failure_group]))
+        while not self.stopping:
+            delay = self.max_thrash_delay
+            if self.randomize:
+                delay = random.randrange(0.0, self.max_thrash_delay)
+
+            if delay > 0.0:
+                self.log('waiting for {delay} secs before thrashing'.format(delay=delay))
+                time.sleep(delay)
+
+            skip = random.randrange(0.0, 1.0)
+            if self.weight < 1.0 and skip > self.weight:
+                self.log('skipping thrash iteration with skip ({skip}) > weight ({weight})'.format(skip=skip,
+                                                                                                   weight=self.weight))
+                continue
+
+            # find the active mds in the failure group
+            statuses = [self.manager.get_mds_status(m) for m in self.failure_group]
+            actives = filter(lambda s: s['state'] == 'up:active', statuses)
+            assert len(actives) == 1, 'Can only have one active in a failure group'
+
+            active_mds = actives[0]['name']
+            active_rank = actives[0]['rank']
+
+            self.log('kill mds.{id} (rank={r})'.format(id=active_mds, r=active_rank))
+            self.manager.kill_mds_by_rank(active_rank)
+
+            # wait for mon to report killed mds as crashed
+            status = {}
+            last_laggy_since = None
+            itercount = 0
+            while True:
+                failed = self.manager.get_mds_status_all()['failed']
+                status = self.manager.get_mds_status(active_mds)
+                if not status:
+                    break
+                if 'laggy_since' in status:
+                    last_laggy_since = status['laggy_since']
+                    break
+                if any([(f == active_mds) for f in failed]):
+                    break
+                self.log(
+                    'waiting till mds map indicates mds.{_id} is laggy/crashed, in failed state, or mds.{_id} is removed from mdsmap'.format(
+                        _id=active_mds))
+                itercount = itercount + 1
+                if itercount > 10:
+                    self.log('mds map: {status}'.format(status=self.manager.get_mds_status_all()))
+                time.sleep(2)
+            if last_laggy_since:
+                self.log(
+                    'mds.{_id} reported laggy/crashed since: {since}'.format(_id=active_mds, since=last_laggy_since))
+            else:
+                self.log('mds.{_id} down, removed from mdsmap'.format(_id=active_mds, since=last_laggy_since))
+
+            # wait for a standby mds to takeover and become active
+            takeover_mds = None
+            takeover_rank = None
+            itercount = 0
+            while True:
+                statuses = [self.manager.get_mds_status(m) for m in self.failure_group]
+                actives = filter(lambda s: s and s['state'] == 'up:active', statuses)
+                if len(actives) > 0:
+                    assert len(actives) == 1, 'Can only have one active in failure group'
+                    takeover_mds = actives[0]['name']
+                    takeover_rank = actives[0]['rank']
+                    break
+                itercount = itercount + 1
+                if itercount > 10:
+                    self.log('mds map: {status}'.format(status=self.manager.get_mds_status_all()))
+
+            self.log('New active mds is mds.{_id}'.format(_id=takeover_mds))
+
+            # wait for a while before restarting old active to become new
+            # standby
+            delay = self.max_revive_delay
+            if self.randomize:
+                delay = random.randrange(0.0, self.max_revive_delay)
+
+            self.log('waiting for {delay} secs before reviving mds.{id}'.format(
+                delay=delay, id=active_mds))
+            time.sleep(delay)
+
+            self.log('reviving mds.{id}'.format(id=active_mds))
+            self.manager.revive_mds(active_mds, standby_for_rank=takeover_rank)
+
+            status = {}
+            while True:
+                status = self.manager.get_mds_status(active_mds)
+                if status and (status['state'] == 'up:standby' or status['state'] == 'up:standby-replay'):
+                    break
+                self.log(
+                    'waiting till mds map indicates mds.{_id} is in standby or standby-replay'.format(_id=active_mds))
+                time.sleep(2)
+            self.log('mds.{_id} reported in {state} state'.format(_id=active_mds, state=status['state']))
+
+            # don't do replay thrashing right now
+            continue
+            # this might race with replay -> active transition...
+            if status['state'] == 'up:replay' and random.randrange(0.0, 1.0) < self.thrash_in_replay:
+
+                delay = self.max_replay_thrash_delay
+                if self.randomize:
+                    delay = random.randrange(0.0, self.max_replay_thrash_delay)
+                time.sleep(delay)
+                self.log('kill replaying mds.{id}'.format(id=self.to_kill))
+                self.manager.kill_mds(self.to_kill)
+
+                delay = self.max_revive_delay
+                if self.randomize:
+                    delay = random.randrange(0.0, self.max_revive_delay)
+
+                self.log('waiting for {delay} secs before reviving mds.{id}'.format(
+                    delay=delay, id=self.to_kill))
+                time.sleep(delay)
+
+                self.log('revive mds.{id}'.format(id=self.to_kill))
+                self.manager.revive_mds(self.to_kill)
 
 
 @contextlib.contextmanager
@@ -268,7 +277,7 @@ def task(ctx, config):
     statuses = None
     statuses_by_rank = None
     while True:
-        statuses = {m : manager.get_mds_status(m) for m in mdslist}
+        statuses = {m: manager.get_mds_status(m) for m in mdslist}
         statuses_by_rank = {}
         for _, s in statuses.iteritems():
             if isinstance(s, dict):
@@ -285,18 +294,18 @@ def task(ctx, config):
 
     # setup failure groups
     failure_groups = {}
-    actives = {s['name'] : s for (_,s) in statuses.iteritems() if s['state'] == 'up:active'}
+    actives = {s['name']: s for (_, s) in statuses.iteritems() if s['state'] == 'up:active'}
     log.info('Actives is: {d}'.format(d=actives))
     log.info('Statuses is: {d}'.format(d=statuses_by_rank))
     for active in actives:
-        for (r,s) in statuses.iteritems():
+        for (r, s) in statuses.iteritems():
             if s['standby_for_name'] == active:
                 if not active in failure_groups:
                     failure_groups[active] = []
                 log.info('Assigning mds rank {r} to failure group {g}'.format(r=r, g=active))
                 failure_groups[active].append(r)
 
-    for (active,standbys) in failure_groups.iteritems():
+    for (active, standbys) in failure_groups.iteritems():
 
         weight = 1.0
         if 'thrash_weights' in config:
@@ -307,10 +316,10 @@ def task(ctx, config):
         thrashers[active] = MDSThrasher(
             ctx, manager, config,
             logger=log.getChild('mds_thrasher.failure_group.[{a}, {sbs}]'.format(
-                                      a=active,
-                                      sbs=', '.join(standbys)
-                                      )
-                                  ),
+                a=active,
+                sbs=', '.join(standbys)
+            )
+            ),
             failure_group=failure_group,
             weight=weight)
         # if thrash_weights isn't specified and we've reached max_thrash,
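
For reference, the gating that the thrash_weights docstring describes reduces to a random delay followed by a weighted draw on each iteration. A minimal standalone sketch, under stated assumptions: the helper name should_thrash is hypothetical, and it draws with random.uniform where the task code above uses random.randrange.

    import random
    import time


    def should_thrash(weight, max_thrash_delay=30.0, randomize=True):
        """Wait up to max_thrash_delay seconds, then decide whether to thrash.

        weight is the per-MDS value from thrash_weights (0.0 to 1.0): 1.0 thrashes
        on every iteration, 0.0 never does.  Illustrative helper only.
        """
        delay = max_thrash_delay
        if randomize:
            delay = random.uniform(0.0, max_thrash_delay)
        time.sleep(delay)
        skip = random.uniform(0.0, 1.0)
        # Skip this iteration when the draw exceeds the MDS's weight.
        return not (weight < 1.0 and skip > weight)

    # With thrash_weights of {mds.a: 0.8, mds.b: 0.2}, roughly 80% of mds.a's
    # iterations and 20% of mds.b's iterations end in a kill/revive cycle.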