From c3c0b080f6ea6b3b1e10cb3a13962ac1fdcfdd48 Mon Sep 17 00:00:00 2001
From: Zack Cerza <zack@cerza.org>
Date: Wed, 5 Mar 2014 11:17:13 -0600
Subject: [PATCH] Add a 6h timeout to workunits

The timeout is configurable, but defaults to six hours. It's implemented
by using the 'timeout' command on the remote host.

Signed-off-by: Zack Cerza <zack.cerza@inktank.com>
---
 teuthology/task/workunit.py | 70 ++++++++++++++++++++++---------------
 1 file changed, 42 insertions(+), 28 deletions(-)

diff --git a/teuthology/task/workunit.py b/teuthology/task/workunit.py
index 182fb91fce..fa34c7d70b 100644
--- a/teuthology/task/workunit.py
+++ b/teuthology/task/workunit.py
@@ -11,6 +11,7 @@ from ..orchestra import run
 
 log = logging.getLogger(__name__)
 
+
 def task(ctx, config):
     """
     Run ceph on all workunits found under the specified path.
@@ -39,7 +40,8 @@ def task(ctx, config):
     on each client simultaneously, AFTER running any workunits specified
     for individual clients. (This prevents unintended simultaneous runs.)
 
-    To customize tests, you can specify environment variables as a dict::
+    To customize tests, you can specify environment variables as a dict. You
+    can also specify a time limit for each work unit (defaults to 6h)::
 
         tasks:
         - ceph:
@@ -51,6 +53,7 @@ def task(ctx, config):
             env:
               FOO: bar
               BAZ: quux
+            timeout: 6h
 
     :param ctx: Context
     :param config: Configuration
@@ -70,6 +73,8 @@ def task(ctx, config):
     if refspec is None:
         refspec = 'HEAD'
 
+    timeout = config.get('timeout', '6h')
+
     log.info('Pulling workunits from ref %s', refspec)
 
     created_dir_dict = {}
@@ -91,13 +96,15 @@ def task(ctx, config):
     with parallel() as p:
         for role, tests in clients.iteritems():
             if role != "all":
-                p.spawn(_run_tests, ctx, refspec, role, tests, config.get('env'))
+                p.spawn(_run_tests, ctx, refspec, role, tests,
+                        config.get('env'), timeout=timeout)
             else:
                 all_spec = True
 
     if all_spec:
         all_tasks = clients["all"]
-        _spawn_on_all_clients(ctx, refspec, all_tasks, config.get('env'), config.get('subdir'))
+        _spawn_on_all_clients(ctx, refspec, all_tasks, config.get('env'),
+                              config.get('subdir'))
 
     for role in clients.iterkeys():
         assert isinstance(role, basestring)
@@ -108,6 +115,7 @@ def task(ctx, config):
         if created_dir_dict[role]:
             _delete_dir(ctx, role)
 
+
 def _delete_dir(ctx, role):
     """
     Delete file used by this role, and delete the directory that this
@@ -221,17 +229,13 @@ def _make_scratch_dir(ctx, role, subdir):
 
     return retVal
 
-def _spawn_on_all_clients(ctx, refspec, tests, env, subdir):
+
+def _spawn_on_all_clients(ctx, refspec, tests, env, subdir, timeout=None):
     """
     Make a scratch directory for each client in the cluster, and then for each
-    test spawn _run_tests for each role.    
+    test spawn _run_tests() for each role.
 
-    :param ctx: Context
-    :param refspec: branch, sha1, or version tag used to identify this
-                    build 
-    :param tests: specific tests specified.
-    :param env: evnironment set in yaml file.  Could be None.
-    :param subdir: subdirectory set in yaml file.  Could be None
+    See _run_tests() for parameter documentation.
     """
     client_generator = teuthology.all_roles_of_type(ctx.cluster, 'client')
     client_remotes = list()
@@ -243,25 +247,32 @@ def _spawn_on_all_clients(ctx, refspec, tests, env, subdir):
     for unit in tests:
         with parallel() as p:
             for remote, role in client_remotes:
-                p.spawn(_run_tests, ctx, refspec, role, [unit], env, subdir)
+                p.spawn(_run_tests, ctx, refspec, role, [unit], env, subdir,
+                        timeout=timeout)
 
     # cleanup the generated client directories
     client_generator = teuthology.all_roles_of_type(ctx.cluster, 'client')
     for client in client_generator:
         _delete_dir(ctx, 'client.{id}'.format(id=client))
 
-def _run_tests(ctx, refspec, role, tests, env, subdir=None):
+
+def _run_tests(ctx, refspec, role, tests, env, subdir=None, timeout=None):
     """
-    Run the individual test.  Create a scratch directory and then extract the workunits
-    from the git-hub.  Make the executables, and then run the tests.
+    Run the individual test. Create a scratch directory and then extract the
+    workunits from git. Make the executables, and then run the tests.
     Clean up (remove files created) after the tests are finished.
 
-    :param ctx: Context
+    :param ctx:     Context
     :param refspec: branch, sha1, or version tag used to identify this
-                    build 
-    :param tests: specific tests specified.
-    :param env: evnironment set in yaml file.  Could be None.
-    :param subdir: subdirectory set in yaml file.  Could be None
+                    build
+    :param tests:   specific tests specified.
+    :param env:     environment set in yaml file.  Could be None.
+    :param subdir:  subdirectory set in yaml file.  Could be None
+    :param timeout: If present, use the 'timeout' command on the remote host
+                    to limit execution time. Must be specified by a number
+                    followed by 's' for seconds, 'm' for minutes, 'h' for
+                    hours, or 'd' for days. If '0' or anything that evaluates
+                    to False is passed, the 'timeout' command is not used.
     """
     testdir = teuthology.get_testdir(ctx)
     assert isinstance(role, basestring)
@@ -332,14 +343,17 @@ def _run_tests(ctx, refspec, role, tests, env, subdir=None):
                         env_arg = '{var}={val}'.format(var=var, val=quoted_val)
                         args.append(run.Raw(env_arg))
                 args.extend([
-                        'adjust-ulimits',
-                        'ceph-coverage',
-                        '{tdir}/archive/coverage'.format(tdir=testdir),
-                        '{srcdir}/{workunit}'.format(
-                            srcdir=srcdir,
-                            workunit=workunit,
-                            ),
-                        ])
+                    'adjust-ulimits',
+                    'ceph-coverage',
+                    '{tdir}/archive/coverage'.format(tdir=testdir)])
+                if timeout and timeout != '0':
+                    args.extend(['timeout', timeout])
+                args.extend([
+                    '{srcdir}/{workunit}'.format(
+                        srcdir=srcdir,
+                        workunit=workunit,
+                        ),
+                    ])
                 remote.run(
                     logger=log.getChild(role),
                     args=args,
-- 
2.39.5