matrix: reimplement Sum

author Samuel Just <sjust@redhat.com>

Mon, 30 Nov 2015 19:13:48 +0000 (11:13 -0800)

committer Loic Dachary <ldachary@redhat.com>

Mon, 1 Feb 2016 05:27:48 +0000 (12:27 +0700)
author Samuel Just <sjust@redhat.com>
Mon, 30 Nov 2015 19:13:48 +0000 (11:13 -0800)
committer Loic Dachary <ldachary@redhat.com>
Mon, 1 Feb 2016 05:27:48 +0000 (12:27 +0700)
diff --git a/teuthology/matrix.py b/teuthology/matrix.py

index aa7339ee0b723be4925908ffdd8af122a18a228a..f4d91a584848a4f3e0e79b56d3aa2995f4e97d7a 100644 (file)
--- a/teuthology/matrix.py
+++ b/teuthology/matrix.py
@@ -1,6 +1,11 @@
  import os
+import heapq
  from fractions import gcd
  
+def lcm(a, b):
+    return a*b // gcd(a, b)
+def lcml(l):
+    return reduce(lcm, l)
  
  class Matrix:
      """
@@ -195,25 +200,77 @@ class Concat(Matrix):
  class Sum(Matrix):
      """
      We want to mix the subsequences proportionately to their size.
+
+    The intuition is that we map all of the subsequences uniformly
+    onto rational numbers in [0, 1).  The ith subsequence with length
+    l will have index k map onto i*<epsilon> + k*(1/l).  i*<epsilon>
+    ensures that no two subsequences have an index which shares a
+    mapping in [0, 1) as long as <epsilon> is chosen to be small
+    enough.
+
+    Rather than actually dealing with rational numbers, however, we'll
+    instead map onto whole numbers in [0, pseudo_size) where
+    pseudo_size is the lcm of the subsequence lengths * the number of
+    subsequences.  Including the number of subsequences in the product
+    allows us to use 1 as <epsilon>.  For each subsequence, we designate
+    an offset (position in input list) and a multiple (pseudo_size / size)
+    such that the pseudo_index for index i is <offset> + i*<multiple>.
+
+    I don't have a good way to map index to pseudo index, so we'll
+    precompute a mapping in the constructor (self._i_to_sis) from
+    index to (subset_index, subset).
      """
      def __init__(self, item, _submats):
          assert len(_submats) > 0, \
              "Sum requires non-empty _submats"
          self.item = item
  
-        submats = sorted(
-            [((i.size(), ind), i) for (i, ind) in
-             zip(_submats, range(len(_submats)))], reverse=True)
-        self.submats = []
-        self._size = 0
-        for ((size, ind), submat) in submats:
-            self.submats.append((self._size, submat))
-            self._size += size
-        self.submats.reverse()
+        self._pseudo_size = lcml((i.size() for i in _submats)) * len(_submats)
+        self._size = sum((i.size() for i in _submats))
+        self._submats = [
+            ((i, self._pseudo_size / s.size()), s) for (i, s) in \
+            zip(range(len(_submats)), _submats)
+        ]
+
+        def sm_to_pmsl(((offset, multiple), submat)):
+            """
+            submat tuple to pseudo minscanlen
+            """
+            return submat.minscanlen() * multiple
+
+        def index_to_pindex_generator(submats):
+            assert len(submats) > 0, "submats must be non-empty"
+            h = []
+            for (offset, multiple), submat in submats:
+                heapq.heappush(h, (offset, 0, multiple, submat))
+            while True:
+                cur, si, multiple, submat = heapq.heappop(h)
+                heapq.heappush(
+                    h,
+                    (cur + multiple, si + 1, multiple, submat))
+                yield si, submat
+
+        self._i_to_sis = dict(
+            zip(range(self._size), index_to_pindex_generator(self._submats))
+        )
+
+        self._minscanlen = self.pseudo_index_to_index(
+            max(map(sm_to_pmsl, self._submats)))
+
+    def pi_to_sis(self, pi, (offset, multiple)):
+        """
+        max(i) s.t. offset + i*multiple <= pi
+        """
+        if pi < offset:
+            return -1
+        return (pi - offset) / multiple
+
+    def pseudo_index_to_index(self, pi):
+        """
+        Count all pseudoindex values <= pi with corresponding subset indices
+        """
+        return sum((self.pi_to_sis(pi, i) + 1 for i, _ in self._submats)) - 1
  
-        self._minscanlen = max(
-            [(self._size / i.size()) *
-             i.minscanlen() for i in _submats])
      def tostr(self, depth):
          ret = '\t'*depth + "Sum({item}):\n".format(item=self.item)
          return ret + ''.join([i[1].tostr(depth+1) for i in self._submats])
@@ -224,41 +281,9 @@ class Sum(Matrix):
      def size(self):
          return self._size
  
-    def _index(self, _i, submats):
-        """
-        We reduce the N sequence problem to a two sequence problem recursively.
-
-        If we have two sequences M and N of length m and n (n > m wlog), we
-        want to mix an M item into the stream every N / M items.  Once we run
-        out of N, we want to simply finish the M stream.
-        """
-        assert len(submats) > 0, \
-            "_index requires non-empty submats"
-        if len(submats) == 1:
-            return submats[0][1].index(_i)
-        lmat = submats[0][1]
-        lsize = lmat.size()
-
-        rsize = submats[0][0]
-
-        mult = rsize / lsize
-        clen = mult + 1
-        thresh = lsize * clen
-        i = _i % (rsize + lsize)
-        base = (_i / (rsize + lsize))
-        if i < thresh:
-            if i % clen == 0:
-                return lmat.index((i / clen) + (base * lsize))
-            else:
-                return self._index(((i / clen) * mult + ((i % clen) - 1)) +
-                                   (base * rsize),
-                                   submats[1:])
-        else:
-            return self._index(i - lsize, submats[1:])
-
      def index(self, i):
-        return (self.item, self._index(i, self.submats))
-
+        si, submat = self._i_to_sis[i % self._size]
+        return (self.item, submat.index(si))
  
  def generate_lists(result):
      """
author	Samuel Just <sjust@redhat.com>
	Mon, 30 Nov 2015 19:13:48 +0000 (11:13 -0800)
committer	Loic Dachary <ldachary@redhat.com>
	Mon, 1 Feb 2016 05:27:48 +0000 (12:27 +0700)