tasks/cephfs: further thrasher fixes
author John Spray <john.spray@redhat.com>
Mon, 11 Jan 2016 19:43:40 +0000 (19:43 +0000)
committer John Spray <john.spray@redhat.com>
Fri, 11 Mar 2016 10:39:37 +0000 (10:39 +0000)
Move the thrasher-specific methods out of CephManager
into MDSThrasher and plumb them into MDSCluster.

Signed-off-by: John Spray <john.spray@redhat.com>
tasks/ceph_manager.py
tasks/cephfs/filesystem.py
tasks/cephfs/test_failover.py
tasks/mds_thrash.py

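For illustration, here is a minimal sketch of the call pattern this commit introduces: the thrasher resolves an MDS rank to a daemon name through the cluster helper (the new MDSCluster.get_mds_info_by_rank) and then acts on the daemon by name. The FakeMDSCluster and FakeThrasher classes below are simplified, hypothetical stand-ins for teuthology's MDSCluster and MDSThrasher, not the real implementations in this diff.

# Minimal sketch of the rank -> name -> action flow moved into MDSThrasher.
# FakeMDSCluster / FakeThrasher are hypothetical stand-ins, not teuthology code.

class FakeMDSCluster(object):
    """Stand-in for tasks.cephfs.filesystem.MDSCluster."""
    def __init__(self, mds_map):
        self._mds_map = mds_map

    def get_mds_info_by_rank(self, mds_rank):
        # Mirrors the new MDSCluster.get_mds_info_by_rank(): scan the
        # mdsmap 'info' entries and return the one holding this rank.
        for mds_info in self._mds_map['info'].values():
            if mds_info['rank'] == mds_rank:
                return mds_info
        return None

class FakeThrasher(object):
    """Stand-in showing how the thrasher now owns kill-by-rank itself."""
    def __init__(self, mds_cluster):
        self.mds_cluster = mds_cluster

    def kill_mds(self, mds):
        # The real method stops (or powercycles) the daemon; just log here.
        print('stopping mds.{0}'.format(mds))

    def kill_mds_by_rank(self, rank):
        # Same shape as the moved helper: resolve rank to name, then kill.
        status = self.mds_cluster.get_mds_info_by_rank(rank)
        self.kill_mds(status['name'])

if __name__ == '__main__':
    mds_map = {'info': {'gid1': {'rank': 0, 'name': 'a'},
                        'gid2': {'rank': 1, 'name': 'b'}}}
    FakeThrasher(FakeMDSCluster(mds_map)).kill_mds_by_rank(1)

The net effect of the refactor is that MDS-specific thrashing actions live in mds_thrash.py, while MDSCluster only exposes generic mdsmap lookups.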
diff --git a/tasks/ceph_manager.py b/tasks/ceph_manager.py
index ce13738ed9936058d5606d8afe4a88e8d7fdd559..378a9ece3ef5cb81633a1b07a1fc2e6cd62064d8 100644 (file)
@@ -49,17 +49,6 @@ def write_conf(ctx, conf_path=DEFAULT_CONF_PATH):
     run.wait(writes)
 
 
-def make_admin_daemon_dir(ctx, remote):
-    """
-    Create /var/run/ceph directory on remote site.
-
-    :param ctx: Context
-    :param remote: Remote site
-    """
-    remote.run(args=['sudo',
-                     'install', '-d', '-m0777', '--', '/var/run/ceph', ], )
-
-
 def mount_osd_data(ctx, remote, osd):
     """
     Mount a remote OSD
@@ -1785,7 +1774,7 @@ class CephManager:
                                 format(o=osd))
             teuthology.reconnect(self.ctx, 60, [remote])
             mount_osd_data(self.ctx, remote, str(osd))
-            make_admin_daemon_dir(self.ctx, remote)
+            self.make_admin_daemon_dir(remote)
             self.ctx.daemons.get_daemon('osd', osd).reset()
         self.ctx.daemons.get_daemon('osd', osd).restart()
 
@@ -1859,7 +1848,7 @@ class CephManager:
                                                 "Check ipmi config.")
 
             remote.console.power_on()
-            make_admin_daemon_dir(self.ctx, remote)
+            self.make_admin_daemon_dir(remote)
         self.ctx.daemons.get_daemon('mon', mon).restart()
 
     def get_mon_status(self, mon):
@@ -1902,60 +1891,6 @@ class CephManager:
             self.log('health:\n{h}'.format(h=out))
         return json.loads(out)
 
-    ## metadata servers
-
-    def kill_mds(self, mds):
-        """
-        Powercyle if set in config, otherwise just stop.
-        """
-        if self.config.get('powercycle'):
-            (remote,) = (self.ctx.cluster.only('mds.{m}'.format(m=mds)).
-                         remotes.iterkeys())
-            self.log('kill_mds on mds.{m} doing powercycle of {s}'.
-                     format(m=mds, s=remote.name))
-            assert remote.console is not None, ("powercycling requested "
-                                                "but RemoteConsole is not "
-                                                "initialized.  "
-                                                "Check ipmi config.")
-            remote.console.power_off()
-        else:
-            self.ctx.daemons.get_daemon('mds', mds).stop()
-
-    def kill_mds_by_rank(self, rank):
-        """
-        kill_mds wrapper to kill based on rank passed.
-        """
-        status = self.get_mds_status_by_rank(rank)
-        self.kill_mds(status['name'])
-
-    def revive_mds(self, mds, standby_for_rank=None):
-        """
-        Revive mds -- do an ipmpi powercycle (if indicated by the config)
-        and then restart (using --hot-standby if specified.
-        """
-        if self.config.get('powercycle'):
-            (remote,) = (self.ctx.cluster.only('mds.{m}'.format(m=mds)).
-                         remotes.iterkeys())
-            self.log('revive_mds on mds.{m} doing powercycle of {s}'.
-                     format(m=mds, s=remote.name))
-            assert remote.console is not None, ("powercycling requested "
-                                                "but RemoteConsole is not "
-                                                "initialized.  "
-                                                "Check ipmi config.")
-            remote.console.power_on()
-            make_admin_daemon_dir(self.ctx, remote)
-        args = []
-        if standby_for_rank:
-            args.extend(['--hot-standby', standby_for_rank])
-        self.ctx.daemons.get_daemon('mds', mds).restart(*args)
-
-    def revive_mds_by_rank(self, rank, standby_for_rank=None):
-        """
-        revive_mds wrapper to revive based on rank passed.
-        """
-        status = self.get_mds_status_by_rank(rank)
-        self.revive_mds(status['name'], standby_for_rank)
-
     def get_mds_status(self, mds):
         """
         Run cluster commands for the mds in order to get mds information
@@ -1968,31 +1903,22 @@ class CephManager:
                 return info
         return None
 
-    def get_mds_status_by_rank(self, rank):
+    def get_filepath(self):
         """
-        Run cluster commands for the mds in order to get mds information
-        check rank.
+        Return path to osd data with {id} needing to be replaced
         """
-        j = self.get_mds_status_all()
-        # collate; for dup ids, larger gid wins.
-        for info in j['info'].itervalues():
-            if info['rank'] == rank:
-                return info
-        return None
+        return "/var/lib/ceph/osd/ceph-{id}"
 
-    def get_mds_status_all(self):
-        """
-        Run cluster command to extract all the mds status.
+    def make_admin_daemon_dir(self, remote):
         """
-        out = self.raw_cluster_cmd('mds', 'dump', '--format=json')
-        j = json.loads(' '.join(out.splitlines()[1:]))
-        return j
+        Create /var/run/ceph directory on remote site.
 
-    def get_filepath(self):
-        """
-        Return path to osd data with {id} needing to be replaced
+        :param remote: Remote site
         """
-        return "/var/lib/ceph/osd/ceph-{id}"
+        remote.run(args=['sudo',
+                         'install', '-d', '-m0777', '--', '/var/run/ceph', ], )
+
 
 def utility_task(name):
     """
diff --git a/tasks/cephfs/filesystem.py b/tasks/cephfs/filesystem.py
index 0d1e178d1016836b2d63967d877a4caf9c46a298..f9644eaf6b068cb82b4ece327e1c56284ec9d904 100644 (file)
@@ -244,6 +244,13 @@ class MDSCluster(object):
 
         return None
 
+    def get_mds_info_by_rank(self, mds_rank):
+        for mds_info in self._all_info():
+            if mds_info['rank'] == mds_rank:
+                return mds_info
+
+        return None
+
 
 class Filesystem(MDSCluster):
     """
diff --git a/tasks/cephfs/test_failover.py b/tasks/cephfs/test_failover.py
index afdfa157d44e3b57aad57edf479f53fa80b10add..7075e2fced07a04f1377f70c378ba57c61e27f6a 100644 (file)
@@ -36,7 +36,8 @@ class TestFailover(CephFSTestCase):
             active = self.fs.get_active_names()
             return active and active[0] in original_standbys
 
-        log.info("Waiting for promotion of one of the original standbys {0}".format(original_standbys))
+        log.info("Waiting for promotion of one of the original standbys {0}".format(
+            original_standbys))
         self.wait_until_true(
             promoted,
             timeout=grace*2)
@@ -76,7 +77,7 @@ class TestFailover(CephFSTestCase):
 
         # Wait for everyone to go laggy
         def laggy():
-            mdsmap = self.fs.mon_manager.get_mds_status_all()
+            mdsmap = self.fs.get_mds_map()
             for info in mdsmap['info'].values():
                 if "laggy_since" not in info:
                     return False
diff --git a/tasks/mds_thrash.py b/tasks/mds_thrash.py
index bd6340764a8fdd80d26f404d1672b89f61a73c28..75530f8a8efaca9aad82596f9a2b30007ca60b8d 100644 (file)
@@ -110,6 +110,10 @@ class MDSThrasher(Greenlet):
         self.failure_group = failure_group
         self.weight = weight
 
+        # TODO support multiple filesystems: will require behavioural change to select
+        # which filesystem to act on when doing rank-ish things
+        self.fs = Filesystem(self.ctx)
+
     def _run(self):
         try:
             self.do_thrash()
@@ -126,15 +130,63 @@ class MDSThrasher(Greenlet):
     def stop(self):
         self.stopping.set()
 
+    def kill_mds(self, mds):
+        if self.config.get('powercycle'):
+            (remote,) = (self.ctx.cluster.only('mds.{m}'.format(m=mds)).
+                         remotes.iterkeys())
+            self.log('kill_mds on mds.{m} doing powercycle of {s}'.
+                     format(m=mds, s=remote.name))
+            assert remote.console is not None, ("powercycling requested "
+                                                "but RemoteConsole is not "
+                                                "initialized.  "
+                                                "Check ipmi config.")
+            remote.console.power_off()
+        else:
+            self.ctx.daemons.get_daemon('mds', mds).stop()
+
+    def kill_mds_by_rank(self, rank):
+        """
+        kill_mds wrapper to kill based on rank passed.
+        """
+        status = self.mds_cluster.get_mds_info_by_rank(rank)
+        self.kill_mds(status['name'])
+
+    def revive_mds(self, mds, standby_for_rank=None):
+        """
+        Revive mds -- do an ipmi powercycle (if indicated by the config)
+        and then restart (using --hot-standby if specified).
+        """
+        if self.config.get('powercycle'):
+            (remote,) = (self.ctx.cluster.only('mds.{m}'.format(m=mds)).
+                         remotes.iterkeys())
+            self.log('revive_mds on mds.{m} doing powercycle of {s}'.
+                     format(m=mds, s=remote.name))
+            assert remote.console is not None, ("powercycling requested "
+                                                "but RemoteConsole is not "
+                                                "initialized.  "
+                                                "Check ipmi config.")
+            remote.console.power_on()
+            self.manager.make_admin_daemon_dir(remote)
+        args = []
+        if standby_for_rank:
+            args.extend(['--hot-standby', standby_for_rank])
+        self.ctx.daemons.get_daemon('mds', mds).restart(*args)
+
+    def revive_mds_by_rank(self, rank, standby_for_rank=None):
+        """
+        revive_mds wrapper to revive based on rank passed.
+        """
+        status = self.mds_cluster.get_mds_info_by_rank(rank)
+        self.revive_mds(status['name'], standby_for_rank)
+
+    def get_mds_status_all(self):
+        return self.fs.get_mds_map()
+
     def do_thrash(self):
         """
         Perform the random thrashing action
         """
 
-        # TODO support multiple filesystems: will require behavioural change to select
-        # which filesystem to act on when doing rank-ish things
-        fs = Filesystem(self.ctx)
-
         self.log('starting mds_do_thrash for failure group: ' + ', '.join(
             ['mds.{_id}'.format(_id=_f) for _f in self.failure_group]))
         while not self.stopping.is_set():
@@ -169,7 +221,7 @@ class MDSThrasher(Greenlet):
             last_laggy_since = None
             itercount = 0
             while True:
-                failed = fs.get_mds_map()['failed']
+                failed = self.fs.get_mds_map()['failed']
                 status = self.mds_cluster.get_mds_info(active_mds)
                 if not status:
                     break
@@ -277,7 +329,6 @@ def task(ctx, config):
         'mds_thrash task requires at least 2 metadata servers'
 
     # choose random seed
-    seed = None
     if 'seed' in config:
         seed = int(config['seed'])
     else:
@@ -349,7 +400,7 @@ def task(ctx, config):
 
         # if thrash_weights isn't specified and we've reached max_thrash,
         # we're done
-        if not 'thrash_weights' in config and len(thrashers) == max_thrashers:
+        if 'thrash_weights' not in config and len(thrashers) == max_thrashers:
             break
 
     try: