From 7cc93751e76e027a5041c065b5b6d17dc39239e1 Mon Sep 17 00:00:00 2001
From: John Spray
Date: Mon, 24 Feb 2014 16:00:37 +0000
Subject: [PATCH] task: Add mds_creation_failure

This is test code to accompany CephFS fix #7485.

Also fix DaemonState.wait_for_exit to clear up its 'proc' attribute
even if it fails, so that subsequent calls to 'restart' happen
properly.

Signed-off-by: John Spray
---
 teuthology/task/ceph.py                 |  6 +-
 teuthology/task/mds_creation_failure.py | 83 +++++++++++++++++++++++++
 2 files changed, 87 insertions(+), 2 deletions(-)
 create mode 100644 teuthology/task/mds_creation_failure.py

diff --git a/teuthology/task/ceph.py b/teuthology/task/ceph.py
index 3f297bf20a..0f8f45eb99 100644
--- a/teuthology/task/ceph.py
+++ b/teuthology/task/ceph.py
@@ -123,8 +123,10 @@ class DaemonState(object):
         """
         clear remote run command value after waiting for exit.
         """
         if self.proc:
-            run.wait([self.proc])
-            self.proc = None
+            try:
+                run.wait([self.proc])
+            finally:
+                self.proc = None
 class CephState(object):
     """

diff --git a/teuthology/task/mds_creation_failure.py b/teuthology/task/mds_creation_failure.py
new file mode 100644
index 0000000000..a3d052fb95
--- /dev/null
+++ b/teuthology/task/mds_creation_failure.py
@@ -0,0 +1,83 @@
+
+import logging
+import contextlib
+import time
+import ceph_manager
+from teuthology import misc
+from teuthology.orchestra.run import CommandFailedError, Raw
+
+log = logging.getLogger(__name__)
+
+
+@contextlib.contextmanager
+def task(ctx, config):
+    """
+    Go through filesystem creation with a synthetic failure in an MDS
+    in its 'up:creating' state, to exercise the retry behaviour.
+    """
+    # Grab handles to the teuthology objects of interest
+    mdslist = list(misc.all_roles_of_type(ctx.cluster, 'mds'))
+    if len(mdslist) != 1:
+        # Require exactly one MDS, the code path for creation failure when
+        # a standby is available is different
+        raise RuntimeError("This task requires exactly one MDS")
+
+    mds_id = mdslist[0]
+    (mds_remote,) = ctx.cluster.only('mds.{_id}'.format(_id=mds_id)).remotes.iterkeys()
+    manager = ceph_manager.CephManager(
+        mds_remote, ctx=ctx, logger=log.getChild('ceph_manager'),
+    )
+
+    # Stop the MDS and reset the filesystem so that next start will go into CREATING
+    mds = ctx.daemons.get_daemon('mds', mds_id)
+    mds.stop()
+    data_pool_id = manager.get_pool_num("data")
+    md_pool_id = manager.get_pool_num("metadata")
+    manager.raw_cluster_cmd_result('mds', 'newfs', md_pool_id.__str__(), data_pool_id.__str__(),
+                                   '--yes-i-really-mean-it')
+
+    # Start the MDS with mds_kill_create_at set, it will crash during creation
+    mds.restart_with_args(["--mds_kill_create_at=1"])
+    try:
+        mds.wait_for_exit()
+    except CommandFailedError as e:
+        if e.exitstatus == 1:
+            log.info("MDS creation killed as expected")
+        else:
+            log.error("Unexpected status code %s" % e.exitstatus)
+            raise
+
+    # Since I have intentionally caused a crash, I will clean up the resulting core
+    # file to avoid task.internal.coredump seeing it as a failure.
+ log.info("Removing core file from synthetic MDS failure") + mds_remote.run(args=['rm', '-f', Raw("{archive}/coredump/*.core".format(archive=misc.get_archive_dir(ctx)))]) + + # It should have left the MDS map state still in CREATING + status = manager.get_mds_status(mds_id) + assert status['state'] == 'up:creating' + + # Start the MDS again without the kill flag set, it should proceed with creation successfully + mds.restart() + + # Wait for state ACTIVE + t = 0 + create_timeout = 120 + while True: + status = manager.get_mds_status(mds_id) + if status['state'] == 'up:active': + log.info("MDS creation completed successfully") + break + elif status['state'] == 'up:creating': + log.info("MDS still in creating state") + if t > create_timeout: + log.error("Creating did not complete within %ss" % create_timeout) + raise RuntimeError("Creating did not complete within %ss" % create_timeout) + t += 1 + time.sleep(1) + else: + log.error("Unexpected MDS state: %s" % status['state']) + assert(status['state'] in ['up:active', 'up:creating']) + + # The system should be back up in a happy healthy state, go ahead and run any further tasks + # inside this context. + yield -- 2.39.5