From 9a57759ec3df7a8b9490920538214ac189418c49 Mon Sep 17 00:00:00 2001
From: Samuel Just <sjust@redhat.com>
Date: Fri, 27 Mar 2015 10:20:28 -0700
Subject: [PATCH] suite: add --subset option to allow us to schedule subsets of
 a suite

First, the old logic tended to do the following:

[0,1,2,3] x [a,b,c,d] -> [(0, a), (0, b), (0, c), ...]

which is a bummer since if we only schedule the first 1/4,
we only get combinations with 0.

Instead, we want:

[0,1,2,3] x [a,b,c,d] -> [(0, a), (1, b), (2, c), ...]

Second, the old logic tended to do the following:

[0,1,2,3] + [a,b,c,d] -> [0,1,2,3,a,b,c,d]

Which is a bummer since we need to all the way through the first
set before we get examples from the second.

Instead, we want:

[0,1,2,3] + [a,b,c,d] -> [0,a,1,b,2,c,3,d]

Lastly, if we have:

[0,1,2] + ([0,1,2],[a,b,c])

and want to schedule 1/3 of the suite, we don't want to
just schedule 0, we probably want to schedule all of
[0,1,2] and 1/3 of the second combo.  Thus, we want to
detect such cases and multiply out [0,1,2] into
[0,1,2,0,1,2,0,1,2].

Signed-off-by: Samuel Just <sam.just@inktank.com>
---
 scripts/suite.py               |   6 +
 teuthology/matrix.py           | 276 +++++++++++++++++++++++++++++++++
 teuthology/suite.py            | 167 +++++++++++++++-----
 teuthology/test/test_matrix.py |  71 +++++++++
 4 files changed, 477 insertions(+), 43 deletions(-)
 create mode 100644 teuthology/matrix.py
 create mode 100644 teuthology/test/test_matrix.py
diff --git a/scripts/suite.py b/scripts/suite.py
index 088b20c23f..013650c1aa 100644
--- a/scripts/suite.py
+++ b/scripts/suite.py
@@ -59,6 +59,12 @@ Scheduler arguments:
                               [default: 1]
   -l <jobs>, --limit <jobs>   Queue at most this many jobs
                               [default: 0]
+  --subset <index/outof>      Instead of scheduling the entire suite, break the
+                              set of jobs into <outof> pieces (each of which will
+                              contain each facet at least once) and schedule
+                              piece <index>.  Scheduling 0/<outof>, 1/<outof>,
+                              2/<outof> ... <outof>-1/<outof> will schedule all
+                              jobs in the suite (many more than once).
   -p <priority>, --priority <priority>
                               Job priority (lower is sooner)
                               [default: 1000]
diff --git a/teuthology/matrix.py b/teuthology/matrix.py
new file mode 100644
index 0000000000..3f76cea193
--- /dev/null
+++ b/teuthology/matrix.py
@@ -0,0 +1,276 @@
+import os
+from fractions import gcd
+
+
+class Matrix:
+    """
+    Interface for sets
+    """
+    def size(self):
+        pass
+
+    def index(self, i):
+        """
+        index() should return a recursive structure represending the paths
+        to concatenate for index i:
+
+        Result :: (PathSegment, Result) | {Result}
+        Path :: string
+
+        {Result} is a frozen_set of Results indicating that
+        the set of paths resulting from each of the contained
+        Results should be concatenated.  (PathSegment, Result)
+        indicates that PathSegment should be prepended to the
+        paths resulting from Result.
+        """
+        pass
+
+    def minscanlen(self):
+        """
+        min run require to get a good sample
+        """
+        pass
+
+    def cyclicity(self):
+        """
+        A cyclicity of N means that the set represented by the Matrix
+        can be chopped into N good subsets of sequential indices.
+        """
+        return self.size() / self.minscanlen()
+
+
+class Cycle(Matrix):
+    """
+    Run a matrix multiple times
+    """
+    def __init__(self, num, mat):
+        self.mat = mat
+        self.num = num
+
+    def size(self):
+        return self.mat.size() * self.num
+
+    def index(self, i):
+        return self.mat.index(i % self.mat.size())
+
+    def minscanlen(self):
+        return self.mat.minscanlen()
+
+
+class Base(Matrix):
+    """
+    Just a single item.
+    """
+    def __init__(self, item):
+        self.item = item
+
+    def size(self):
+        return 1
+
+    def index(self, i):
+        return self.item
+
+    def minscanlen(self):
+        return 1
+
+
+class Product(Matrix):
+    """
+    Builds items by taking one item from each submatrix.  Contiguous
+    subsequences should move through all dimensions.
+    """
+    def __init__(self, item, _submats):
+        assert len(_submats) > 0, \
+            "Product requires child submats to be passed in"
+        self.item = item
+
+        submats = sorted(
+            [((i.size(), ind), i) for (i, ind) in
+             zip(_submats, range(len(_submats)))], reverse=True)
+        self.submats = []
+        self._size = 1
+        for ((size, _), submat) in submats:
+            self.submats.append((self._size, submat))
+            self._size *= size
+        self.submats.reverse()
+
+        self._minscanlen = max([i.minscanlen() for i in _submats])
+
+    def minscanlen(self):
+        return self._minscanlen
+
+    def size(self):
+        return self._size
+
+    def _index(self, i, submats):
+        """
+        We recursively reduce the N dimension problem to a two
+        dimension problem.
+
+        index(i) = (lmat.index(i % lmat.size()), rmat.index(i %
+        rmat.size())) would simply work if lmat.size() and rmat.size()
+        are relatively prime.
+
+        In general, if the gcd(lmat.size(), rmat.size()) == N,
+        index(i) would be periodic on the interval (lmat.size() *
+        rmat.size()) / N.  To adjust, we increment the lmat index
+        number on each repeat.  Each of the N repeats must therefore
+        be distinct from the previous ones resulting in lmat.size() *
+        rmat.size() combinations.
+        """
+        assert len(submats) > 0, \
+            "_index requires non-empty submats"
+        if len(submats) == 1:
+            return frozenset([submats[0][1].index(i)])
+
+        lmat = submats[0][1]
+        lsize = lmat.size()
+
+        rsize = submats[0][0]
+
+        cycles = gcd(rsize, lsize)
+        clen = (rsize * lsize) / cycles
+        off = (i / clen) % cycles
+
+        def combine(r, s=frozenset()):
+            if type(r) is frozenset:
+                return s | r
+            return s | frozenset([r])
+
+        litems = lmat.index(i + off)
+        ritems = self._index(i, submats[1:])
+        return combine(litems, combine(ritems))
+
+    def index(self, i):
+        items = self._index(i, self.submats)
+        return (self.item, items)
+
+class Concat(Matrix):
+    """
+    Concatenates all items in child matrices
+    """
+    def __init__(self, item, submats):
+        self.submats = submats
+        self.item = item
+
+    def size(self):
+        return 1
+
+    def minscanlen(self):
+        return 1
+
+    def index(self, i):
+        out = frozenset()
+        for submat in self.submats:
+            for i in range(submat.size()):
+                out = out | frozenset([submat.index(i)])
+        return out
+
+class Sum(Matrix):
+    """
+    We want to mix the subsequences proportionately to their size.
+    """
+    def __init__(self, item, _submats):
+        assert len(_submats) > 0, \
+            "Sum requires non-empty _submats"
+        self.item = item
+
+        submats = sorted(
+            [((i.size(), ind), i) for (i, ind) in
+             zip(_submats, range(len(_submats)))], reverse=True)
+        self.submats = []
+        self._size = 0
+        for ((size, ind), submat) in submats:
+            self.submats.append((self._size, submat))
+            self._size += size
+        self.submats.reverse()
+
+        self._minscanlen = max(
+            [(self._size / i.size()) *
+             i.minscanlen() for i in _submats])
+
+    def minscanlen(self):
+        return self._minscanlen
+
+    def size(self):
+        return self._size
+
+    def _index(self, _i, submats):
+        """
+        We reduce the N sequence problem to a two sequence problem recursively.
+
+        If we have two sequences M and N of length m and n (n > m wlog), we
+        want to mix an M item into the stream every N / M items.  Once we run
+        out of N, we want to simply finish the M stream.
+        """
+        assert len(submats) > 0, \
+            "_index requires non-empty submats"
+        if len(submats) == 1:
+            return submats[0][1].index(_i)
+        lmat = submats[0][1]
+        lsize = lmat.size()
+
+        rsize = submats[0][0]
+
+        mult = rsize / lsize
+        clen = mult + 1
+        thresh = lsize * clen
+        i = _i % (rsize + lsize)
+        base = (_i / (rsize + lsize))
+        if i < thresh:
+            if i % clen == 0:
+                return lmat.index((i / clen) + (base * lsize))
+            else:
+                return self._index(((i / clen) * mult + ((i % clen) - 1)) +
+                                   (base * rsize),
+                                   submats[1:])
+        else:
+            return self._index(i - lsize, submats[1:])
+
+    def index(self, i):
+        return (self.item, self._index(i, self.submats))
+
+
+def generate_lists(result):
+    """
+    Generates a set of tuples representing paths to concatenate
+    """
+    if type(result) is frozenset:
+        ret = []
+        for i in result:
+            ret.extend(generate_lists(i))
+        return frozenset(ret)
+    elif type(result) is tuple:
+        ret = []
+        (item, children) = result
+        for f in generate_lists(children):
+            nf = [item]
+            nf.extend(f)
+            ret.append(tuple(nf))
+        return frozenset(ret)
+    else:
+        return frozenset([(result,)])
+
+
+def generate_paths(path, result, joinf=os.path.join):
+    """
+    Generates from the result set a list of sorted paths to concatenate
+    """
+    return [reduce(joinf, i, path) for i in sorted(generate_lists(result))]
+
+
+def generate_desc(joinf, result):
+    """
+    Generates the text description of the test represented by result
+    """
+    if type(result) is frozenset:
+        ret = []
+        for i in sorted(result):
+            ret.append(generate_desc(joinf, i))
+        return '{' + ' '.join(ret) + '}'
+    elif type(result) is tuple:
+        (item, children) = result
+        cdesc = generate_desc(joinf, children)
+        return joinf(str(item), cdesc)
+    else:
+        return str(result)
diff --git a/teuthology/suite.py b/teuthology/suite.py
index 214d2a5c93..2db71e6d31 100644
--- a/teuthology/suite.py
+++ b/teuthology/suite.py
@@ -4,7 +4,6 @@
 
 import copy
 from datetime import datetime
-import itertools
 import logging
 import os
 import requests
@@ -14,10 +13,12 @@ import smtplib
 import socket
 import sys
 import yaml
+import math
 from email.mime.text import MIMEText
 from tempfile import NamedTemporaryFile
 
 import teuthology
+import matrix
 from . import lock
 from .config import config, JobConfig
 from .exceptions import BranchNotFoundError, ScheduleFailError
@@ -59,6 +60,11 @@ def main(args):
     filter_in = args['--filter']
     filter_out = args['--filter-out']
 
+    subset = None
+    if args['--subset']:
+        # take input string '2/3' and turn into (2, 3)
+        subset = tuple(map(int, args['--subset'].split('/')))
+
     name = make_run_name(suite, ceph_branch, kernel_branch, kernel_flavor,
                          machine_type)
 
@@ -102,6 +108,7 @@ def main(args):
                          verbose=verbose,
                          filter_in=filter_in,
                          filter_out=filter_out,
+                         subset=subset,
                          )
     os.remove(base_yaml_path)
 
@@ -240,7 +247,9 @@ def create_initial_config(suite, suite_branch, ceph_branch, teuthology_branch,
 
 def prepare_and_schedule(job_config, suite_repo_path, base_yaml_paths, limit,
                          num, timeout, dry_run, verbose,
-                         filter_in, filter_out):
+                         filter_in,
+                         filter_out,
+                         subset):
     """
     Puts together some "base arguments" with which to execute
     teuthology-schedule for each job, then passes them and other parameters to
@@ -281,6 +290,7 @@ def prepare_and_schedule(job_config, suite_repo_path, base_yaml_paths, limit,
         dry_run=dry_run,
         filter_in=filter_in,
         filter_out=filter_out,
+        subset=subset
     )
 
     if job_config.email and num_jobs:
@@ -456,6 +466,7 @@ def schedule_suite(job_config,
                    dry_run=True,
                    filter_in=None,
                    filter_out=None,
+                   subset=None
                    ):
     """
     schedule one suite.
@@ -464,7 +475,7 @@ def schedule_suite(job_config,
     suite_name = job_config.suite
     log.debug('Suite %s in %s' % (suite_name, path))
     configs = [(combine_path(suite_name, item[0]), item[1]) for item in
-               build_matrix(path)]
+               build_matrix(path, subset=subset)]
     log.info('Suite %s in %s generated %d jobs (not yet filtered)' % (
         suite_name, path, len(configs)))
 
@@ -675,8 +686,7 @@ def combine_path(left, right):
     return left
 
 
-def build_matrix(path, _isfile=os.path.isfile, _isdir=os.path.isdir,
-                 _listdir=os.listdir):
+def generate_combinations(path, mat, generate_from, generate_to):
     """
     Return a list of items describe by path
 
@@ -695,6 +705,51 @@ def build_matrix(path, _isfile=os.path.isfile, _isdir=os.path.isdir,
     for each item in the directory, and then do a product to generate
     a result list with all combinations.
 
+    The final description (after recursion) for each item will look
+    like a relative path.  If there was a % product, that path
+    component will appear as a file with braces listing the selection
+    of chosen subitems.
+    """
+    ret = []
+    for i in range(generate_from, generate_to):
+        output = mat.index(i)
+        ret.append((
+            matrix.generate_desc(combine_path, output),
+            matrix.generate_paths(path, output, combine_path)))
+    return ret
+
+
+def build_matrix(path, _isfile=os.path.isfile,
+                 _isdir=os.path.isdir,
+                 _listdir=os.listdir,
+                 subset=None):
+    """
+    Return a list of items descibed by path such that if the list of
+    items is chunked into mincyclicity pieces, each piece is still a
+    good subset of the suite.
+
+    A good subset of a product ensures that each facet member appears
+    at least once.  A good subset of a sum ensures that the subset of
+    each sub collection reflected in the subset is a good subset.
+
+    A mincyclicity of 0 does not attempt to enforce the good subset
+    property.
+
+    The input is just a path.  The output is an array of (description,
+    [file list]) tuples.
+
+    For a normal file we generate a new item for the result list.
+
+    For a directory, we (recursively) generate a new item for each
+    file/dir.
+
+    For a directory with a magic '+' file, we generate a single item
+    that concatenates all files/subdirs (A Sum).
+
+    For a directory with a magic '%' file, we generate a result set
+    for each item in the directory, and then do a product to generate
+    a result list with all combinations (A Product).
+
     The final description (after recursion) for each item will look
     like a relative path.  If there was a % product, that path
     component will appear as a file with braces listing the selection
@@ -704,56 +759,82 @@ def build_matrix(path, _isfile=os.path.isfile, _isdir=os.path.isdir,
     :param _isfile:     Custom os.path.isfile(); for testing only
     :param _isdir:      Custom os.path.isdir(); for testing only
     :param _listdir:	Custom os.listdir(); for testing only
-    """
+    :param subset:	(index, outof)
+    """
+    mat = None
+    first = None
+    matlimit = None
+    if subset:
+        (index, outof) = subset
+        mat = _build_matrix(path, _isfile, _isdir, _listdir, mincyclicity=outof)
+        first = (mat.size() / outof) * index
+        if index == outof or index == outof - 1:
+            matlimit = mat.size()
+        else:
+            matlimit = (mat.size() / outof) * (index + 1)
+    else:
+        first = 0
+        mat = _build_matrix(path, _isfile, _isdir, _listdir)
+        matlimit = mat.size()
+    return generate_combinations(path, mat, first, matlimit)
+
+def _build_matrix(path, _isfile=os.path.isfile,
+                  _isdir=os.path.isdir, _listdir=os.listdir, mincyclicity=0, item=''):
     if _isfile(path):
         if path.endswith('.yaml'):
-            return [(None, [path])]
-        return []
+            return matrix.Base(item)
+        assert False, "Invalid file seen in _build_matrix"
+        return None
     if _isdir(path):
         files = sorted(_listdir(path))
         if '+' in files:
             # concatenate items
             files.remove('+')
-            raw = []
-            for fn in files:
-                raw.extend(
-                    build_matrix(os.path.join(path, fn),
-                                 _isfile, _isdir, _listdir)
-                )
-            out = [(
-                '{' + ' '.join(files) + '}',
-                [a[1][0] for a in raw]
-            )]
-            return out
+            submats = []
+            for fn in sorted(files):
+                submats.append(
+                    _build_matrix(
+                        os.path.join(path, fn),
+                        _isfile,
+                        _isdir,
+                        _listdir,
+                        mincyclicity,
+                        fn))
+            return matrix.Concat(item, submats)
         elif '%' in files:
             # convolve items
             files.remove('%')
-            sublists = []
-            for fn in files:
-                raw = build_matrix(os.path.join(path, fn),
-                                   _isfile, _isdir, _listdir)
-                if raw:
-                    sublists.append([(combine_path(fn, item[0]), item[1])
-                                     for item in raw])
-            out = []
-            if sublists:
-                for sublist in itertools.product(*sublists):
-                    name = '{' + ' '.join([item[0] for item in sublist]) + '}'
-                    val = []
-                    for item in sublist:
-                        val.extend(item[1])
-                    out.append((name, val))
-            return out
+            submats = []
+            for fn in sorted(files):
+                submat = _build_matrix(
+                    os.path.join(path, fn),
+                    _isfile,
+                    _isdir,
+                    _listdir,
+                    mincyclicity=0,
+                    item=fn)
+                submats.append(submat)
+            return matrix.Product(item, submats)
         else:
             # list items
-            out = []
-            for fn in files:
-                raw = build_matrix(os.path.join(path, fn),
-                                   _isfile, _isdir, _listdir)
-                out.extend([(combine_path(fn, item[0]), item[1])
-                           for item in raw])
-            return out
-    return []
+            submats = []
+            for fn in sorted(files):
+                submat = _build_matrix(
+                    os.path.join(path, fn),
+                    _isfile,
+                    _isdir,
+                    _listdir,
+                    mincyclicity,
+                    fn)
+                if submat.cyclicity() < mincyclicity:
+                    submat = matrix.Cycle(
+                        int(math.ceil(
+                            mincyclicity / submat.cyclicity())),
+                        submat)
+                submats.append(submat)
+            return matrix.Sum(item, submats)
+    assert False, "Invalid path seen in _build_matrix"
+    return None
 
 
 def get_arch(machine_type):
diff --git a/teuthology/test/test_matrix.py b/teuthology/test/test_matrix.py
new file mode 100644
index 0000000000..bad5efc494
--- /dev/null
+++ b/teuthology/test/test_matrix.py
@@ -0,0 +1,71 @@
+from .. import matrix
+
+def verify_matrix_output_diversity(res):
+    """
+    Verifies that the size of the matrix passed matches the number of unique
+    outputs from res.index
+    """
+    sz = res.size()
+    s = frozenset([matrix.generate_lists(res.index(i)) for i in range(sz)])
+    for i in range(res.size()):
+        assert sz == len(s)
+
+def mbs(num, l):
+    return matrix.Sum(num*10, [matrix.Base(i + (100*num)) for i in l])
+
+class TestMatrix(object):
+    def test_simple(self):
+        verify_matrix_output_diversity(mbs(1, range(6)))
+
+    def test_simple2(self):
+        verify_matrix_output_diversity(mbs(1, range(5)))
+
+    # The test_product* tests differ by the degree by which dimension
+    # sizes share prime factors
+    def test_product_simple(self):
+        verify_matrix_output_diversity(
+            matrix.Product(1, [mbs(1, range(6)), mbs(2, range(2))]))
+
+    def test_product_3_facets_2_prime_factors(self):
+        verify_matrix_output_diversity(matrix.Product(1, [
+                    mbs(1, range(6)),
+                    mbs(2, range(2)),
+                    mbs(3, range(3)),
+                    ]))
+
+    def test_product_3_facets_2_prime_factors_one_larger(self):
+        verify_matrix_output_diversity(matrix.Product(1, [
+                    mbs(1, range(2)),
+                    mbs(2, range(5)),
+                    mbs(4, range(4)),
+                    ]))
+
+    def test_product_4_facets_2_prime_factors(self):
+        verify_matrix_output_diversity(matrix.Sum(1, [
+                    mbs(1, range(6)),
+                    mbs(3, range(3)),
+                    mbs(2, range(2)),
+                    mbs(4, range(9)),
+                    ]))
+
+    def test_product_2_facets_2_prime_factors(self):
+        verify_matrix_output_diversity(matrix.Sum(1, [
+                    mbs(1, range(2)),
+                    mbs(2, range(5)),
+                    ]))
+
+    def test_product_with_sum(self):
+        verify_matrix_output_diversity(matrix.Sum(
+                9,
+                [
+                    mbs(10, range(6)),
+                    matrix.Product(1, [
+                            mbs(1, range(2)),
+                            mbs(2, range(5)),
+                            mbs(4, range(4))]),
+                    matrix.Product(8, [
+                            mbs(7, range(2)),
+                            mbs(6, range(5)),
+                            mbs(5, range(4))])
+                    ]
+                ))
-- 
2.39.5