From: Samuel Just <sjust@redhat.com>
Date: Mon, 30 Nov 2015 19:13:48 +0000 (-0800)
Subject: matrix: reimplement Sum
X-Git-Tag: 1.1.0~680^2
X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=refs%2Fpull%2F773%2Fhead;p=teuthology.git

matrix: reimplement Sum

See the docstring for details on the new implementation.  The
old one didn't really satisfy the minscanlen properties at the
tail of the sequence.

Signed-off-by: Samuel Just <sjust@redhat.com>
---

diff --git a/teuthology/matrix.py b/teuthology/matrix.py
index aa7339ee0..f4d91a584 100644
--- a/teuthology/matrix.py
+++ b/teuthology/matrix.py
@@ -1,6 +1,11 @@
 import os
+import heapq
 from fractions import gcd
 
+def lcm(a, b):
+    return a*b // gcd(a, b)
+def lcml(l):
+    return reduce(lcm, l)
 
 class Matrix:
     """
@@ -195,25 +200,77 @@ class Concat(Matrix):
 class Sum(Matrix):
     """
     We want to mix the subsequences proportionately to their size.
+
+    The intuition is that we map all of the subsequences uniformly
+    onto rational numbers in [0, 1).  The ith subsequence with length
+    l will have index k map onto i*<epsilon> + k*(1/l).  i*<epsilon>
+    ensures that no two subsequences have an index which shares a
+    mapping in [0, 1) as long as <epsilon> is chosen to be small
+    enough.
+
+    Rather than actually dealing with rational numbers, however, we'll
+    instead map onto whole numbers in [0, pseudo_size) where
+    pseudo_size is the lcm of the subsequence lengths * the number of
+    subsequences.  Including the number of subsequences in the product
+    allows us to use 1 as <epsilon>.  For each subsequence, we designate
+    an offset (position in input list) and a multiple (pseudo_size / size)
+    such that the pseudo_index for index i is <offset> + i*<multiple>.
+
+    I don't have a good way to map index to pseudo index, so we'll
+    precompute a mapping in the constructor (self._i_to_sis) from
+    index to (subset_index, subset).
     """
     def __init__(self, item, _submats):
         assert len(_submats) > 0, \
             "Sum requires non-empty _submats"
         self.item = item
 
-        submats = sorted(
-            [((i.size(), ind), i) for (i, ind) in
-             zip(_submats, range(len(_submats)))], reverse=True)
-        self.submats = []
-        self._size = 0
-        for ((size, ind), submat) in submats:
-            self.submats.append((self._size, submat))
-            self._size += size
-        self.submats.reverse()
+        self._pseudo_size = lcml((i.size() for i in _submats)) * len(_submats)
+        self._size = sum((i.size() for i in _submats))
+        self._submats = [
+            ((i, self._pseudo_size / s.size()), s) for (i, s) in \
+            zip(range(len(_submats)), _submats)
+        ]
+
+        def sm_to_pmsl(((offset, multiple), submat)):
+            """
+            submat tuple to pseudo minscanlen
+            """
+            return submat.minscanlen() * multiple
+
+        def index_to_pindex_generator(submats):
+            assert len(submats) > 0, "submats must be non-empty"
+            h = []
+            for (offset, multiple), submat in submats:
+                heapq.heappush(h, (offset, 0, multiple, submat))
+            while True:
+                cur, si, multiple, submat = heapq.heappop(h)
+                heapq.heappush(
+                    h,
+                    (cur + multiple, si + 1, multiple, submat))
+                yield si, submat
+
+        self._i_to_sis = dict(
+            zip(range(self._size), index_to_pindex_generator(self._submats))
+        )
+
+        self._minscanlen = self.pseudo_index_to_index(
+            max(map(sm_to_pmsl, self._submats)))
+
+    def pi_to_sis(self, pi, (offset, multiple)):
+        """
+        max(i) s.t. offset + i*multiple <= pi
+        """
+        if pi < offset:
+            return -1
+        return (pi - offset) / multiple
+
+    def pseudo_index_to_index(self, pi):
+        """
+        Count all pseudoindex values <= pi with corresponding subset indices
+        """
+        return sum((self.pi_to_sis(pi, i) + 1 for i, _ in self._submats)) - 1
 
-        self._minscanlen = max(
-            [(self._size / i.size()) *
-             i.minscanlen() for i in _submats])
     def tostr(self, depth):
         ret = '\t'*depth + "Sum({item}):\n".format(item=self.item)
         return ret + ''.join([i[1].tostr(depth+1) for i in self._submats])
@@ -224,41 +281,9 @@ class Sum(Matrix):
     def size(self):
         return self._size
 
-    def _index(self, _i, submats):
-        """
-        We reduce the N sequence problem to a two sequence problem recursively.
-
-        If we have two sequences M and N of length m and n (n > m wlog), we
-        want to mix an M item into the stream every N / M items.  Once we run
-        out of N, we want to simply finish the M stream.
-        """
-        assert len(submats) > 0, \
-            "_index requires non-empty submats"
-        if len(submats) == 1:
-            return submats[0][1].index(_i)
-        lmat = submats[0][1]
-        lsize = lmat.size()
-
-        rsize = submats[0][0]
-
-        mult = rsize / lsize
-        clen = mult + 1
-        thresh = lsize * clen
-        i = _i % (rsize + lsize)
-        base = (_i / (rsize + lsize))
-        if i < thresh:
-            if i % clen == 0:
-                return lmat.index((i / clen) + (base * lsize))
-            else:
-                return self._index(((i / clen) * mult + ((i % clen) - 1)) +
-                                   (base * rsize),
-                                   submats[1:])
-        else:
-            return self._index(i - lsize, submats[1:])
-
     def index(self, i):
-        return (self.item, self._index(i, self.submats))
-
+        si, submat = self._i_to_sis[i % self._size]
+        return (self.item, submat.index(si))
 
 def generate_lists(result):
     """