qa: add more ephemeral pin tests
author    Patrick Donnelly <pdonnell@redhat.com>
          Tue, 9 Jun 2020 22:28:21 +0000 (15:28 -0700)
committer Patrick Donnelly <pdonnell@redhat.com>
          Wed, 24 Jun 2020 22:43:32 +0000 (15:43 -0700)
Signed-off-by: Patrick Donnelly <pdonnell@redhat.com>
qa/suites/multimds/basic/tasks/cephfs_test_exports.yaml
qa/tasks/cephfs/cephfs_test_case.py
qa/tasks/cephfs/filesystem.py
qa/tasks/cephfs/test_exports.py
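
The new tests exercise the CephFS ephemeral pinning interface end to end. For orientation, here is a minimal sketch of that interface, using only calls that appear in the diffs below (the surrounding CephFSTestCase environment, self.mount_a, and self.config_set come from the qa framework and are assumed):

    # Sketch only; runs inside a qa test, not standalone.
    # Enable the ephemeral pin features on all MDS daemons.
    self.config_set('mds', 'mds_export_ephemeral_distributed', True)
    self.config_set('mds', 'mds_export_ephemeral_random', True)
    self.config_set('mds', 'mds_export_ephemeral_random_max', 1.0)

    # The policies are virtual xattrs on a directory:
    self.mount_a.setfattr("tree", "ceph.dir.pin.distributed", "1")  # spread children across ranks
    self.mount_a.setfattr("tree", "ceph.dir.pin.random", "0.5")     # pin new children with probability 0.5
    self.mount_a.setfattr("tree", "ceph.dir.pin", "0")              # conventional export pin; overrides both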

diff --git a/qa/suites/multimds/basic/tasks/cephfs_test_exports.yaml b/qa/suites/multimds/basic/tasks/cephfs_test_exports.yaml
index 46334fe16d09eae9201c467b77e75fe8ff74af51..6eb6c987c36d08efc732509290b0343e7d9a3618 100644
@@ -1,3 +1,7 @@
+overrides:
+  ceph:
+    log-whitelist:
+      - Replacing daemon mds
 tasks:
 - cephfs_test_runner:
     fail_on_skip: false
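
The whitelist entry added above is needed because several of the new tests deliberately fail an MDS rank, which produces a "Replacing daemon mds ..." cluster log entry that would otherwise mark the run failed. The pattern, as used by the failover tests further down:

    # Sketch, inside a qa test: fail rank 1 and wait for its replacement.
    self.fs.rank_fail(rank=1)
    self.status = self.fs.wait_for_daemons()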
diff --git a/qa/tasks/cephfs/cephfs_test_case.py b/qa/tasks/cephfs/cephfs_test_case.py
index eee6bb7a036015813462db52dbef98f89612a204..1c206dc592ec05122261a5e61b9132f103aa54e0 100644
@@ -288,13 +288,22 @@ class CephFSTestCase(CephTestCase):
         else:
             log.info("No core_pattern directory set, nothing to clear (internal.coredump not enabled?)")
 
-    def _get_subtrees(self, status=None, rank=None):
+    def _get_subtrees(self, status=None, rank=None, path=None):
+        if path is None:
+            path = "/"
         try:
             with contextutil.safe_while(sleep=1, tries=3) as proceed:
                 while proceed():
                     try:
-                        subtrees = self.fs.rank_asok(["get", "subtrees"], status=status, rank=rank)
-                        subtrees = filter(lambda s: s['dir']['path'].startswith('/'), subtrees)
+                        if rank == "all":
+                            subtrees = []
+                            for r in self.fs.get_ranks(status=status):
+                                s = self.fs.rank_asok(["get", "subtrees"], status=status, rank=r['rank'])
+                                s = filter(lambda s: s['auth_first'] == r['rank'] and s['auth_second'] == -2, s)
+                                subtrees += s
+                        else:
+                            subtrees = self.fs.rank_asok(["get", "subtrees"], status=status, rank=rank)
+                        subtrees = filter(lambda s: s['dir']['path'].startswith(path), subtrees)
                         return list(subtrees)
                     except CommandFailedError as e:
                         # Sometimes we get transient errors
@@ -305,12 +314,12 @@ class CephFSTestCase(CephTestCase):
         except contextutil.MaxWhileTries as e:
             raise RuntimeError(f"could not get subtree state from rank {rank}") from e
 
-    def _wait_subtrees(self, test, status=None, rank=None, timeout=30, sleep=2, action=None):
+    def _wait_subtrees(self, test, status=None, rank=None, timeout=30, sleep=2, action=None, path=None):
         test = sorted(test)
         try:
             with contextutil.safe_while(sleep=sleep, tries=timeout//sleep) as proceed:
                 while proceed():
-                    subtrees = self._get_subtrees(status=status, rank=rank)
+                    subtrees = self._get_subtrees(status=status, rank=rank, path=path)
                     filtered = sorted([(s['dir']['path'], s['auth_first']) for s in subtrees])
                     log.info("%s =?= %s", filtered, test)
                     if filtered == test:
@@ -332,42 +341,26 @@ class CephFSTestCase(CephTestCase):
                 if out_json['status'] == "no active scrubs running":
                     break;
 
-    def _wait_distributed_subtrees(self, status, rank, count):
-        timeout = 30
-        pause = 2
-        for i in range(timeout//pause):
-            subtrees = self.fs.mds_asok(["get", "subtrees"], mds_id=status.get_rank(self.fs.id, rank)['name'])
-            subtrees = filter(lambda s: s['dir']['path'].startswith('/'), subtrees)
-            subtrees = list(filter(lambda s: s['distributed_ephemeral_pin'] == 1, subtrees))
-            if (len(subtrees) == count):
-                return subtrees
-            time.sleep(pause)
-        raise RuntimeError("rank {0} failed to reach desired subtree state".format(rank))
-
-    def get_auth_subtrees(self, status, rank):
-        subtrees = self.fs.mds_asok(["get", "subtrees"], mds_id=status.get_rank(self.fs.id, rank)['name'])
-        subtrees = filter(lambda s: s['dir']['path'].startswith('/'), subtrees)
-        subtrees = filter(lambda s: s['auth_first'] == rank, subtrees)
-
-        return list(subtrees)
-
-    def get_ephemerally_pinned_auth_subtrees(self, status, rank):
-        subtrees = self.fs.mds_asok(["get", "subtrees"], mds_id=status.get_rank(self.fs.id, rank)['name'])
-        subtrees = filter(lambda s: s['dir']['path'].startswith('/'), subtrees)
-        subtrees = filter(lambda s: (s['distributed_ephemeral_pin'] == 1 or s['random_ephemeral_pin'] == 1) and (s['auth_first'] == rank), subtrees)
-
-        return list(subtrees)
-
-    def get_distributed_auth_subtrees(self, status, rank):
-        subtrees = self.fs.mds_asok(["get", "subtrees"], mds_id=status.get_rank(self.fs.id, rank)['name'])
-        subtrees = filter(lambda s: s['dir']['path'].startswith('/'), subtrees)
-        subtrees = filter(lambda s: (s['distributed_ephemeral_pin'] == 1) and (s['auth_first'] == rank), subtrees)
-
-        return list(subtrees)
-
-    def get_random_auth_subtrees(self, status, rank):
-        subtrees = self.fs.mds_asok(["get", "subtrees"], mds_id=status.get_rank(self.fs.id, rank)['name'])
-        subtrees = filter(lambda s: s['dir']['path'].startswith('/'), subtrees)
-        subtrees = filter(lambda s: (s['random_ephemeral_pin'] == 1) and (s['auth_first'] == rank), subtrees)
-
-        return list(subtrees)
+    def _wait_distributed_subtrees(self, count, status=None, rank=None, path=None):
+        try:
+            with contextutil.safe_while(sleep=5, tries=20) as proceed:
+                while proceed():
+                    subtrees = self._get_subtrees(status=status, rank=rank, path=path)
+                    subtrees = list(filter(lambda s: s['distributed_ephemeral_pin'] == True, subtrees))
+                    log.info(f"len={len(subtrees)} {subtrees}")
+                    if len(subtrees) >= count:
+                        return subtrees
+        except contextutil.MaxWhileTries as e:
+            raise RuntimeError("rank {0} failed to reach desired subtree state".format(rank)) from e
+
+    def _wait_random_subtrees(self, count, status=None, rank=None, path=None):
+        try:
+            with contextutil.safe_while(sleep=5, tries=20) as proceed:
+                while proceed():
+                    subtrees = self._get_subtrees(status=status, rank=rank, path=path)
+                    subtrees = list(filter(lambda s: s['random_ephemeral_pin'] == True, subtrees))
+                    log.info(f"len={len(subtrees)} {subtrees}")
+                    if len(subtrees) >= count:
+                        return subtrees
+        except contextutil.MaxWhileTries as e:
+            raise RuntimeError("rank {0} failed to reach desired subtree state".format(rank)) from e
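
Both wait helpers now funnel through _get_subtrees, which accepts rank="all" (collect each rank's authoritative subtrees, i.e. auth_first == rank and auth_second == -2) and an optional path prefix filter. A usage sketch mirroring how the rewritten tests call them (assumes a CephFSTestCase with self.status = self.fs.wait_for_daemons() already done):

    # All authoritative subtrees across every active rank, limited to /tree:
    subtrees = self._get_subtrees(status=self.status, rank="all", path="/tree")

    # Block until at least 100 subtrees carry the distributed ephemeral pin;
    # raises RuntimeError after 20 tries at 5-second intervals:
    pinned = self._wait_distributed_subtrees(100, status=self.status, rank="all")
    assert all(s['distributed_ephemeral_pin'] for s in pinned)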
diff --git a/qa/tasks/cephfs/filesystem.py b/qa/tasks/cephfs/filesystem.py
index bb8e41e83d2a10dde4f7097a767ce98a7fc29cca..29c168a4953a7055f48fa5f1a4a9f813cdd7cca4 100644
@@ -1021,8 +1021,18 @@ class Filesystem(MDSCluster):
     def ranks_tell(self, command, status=None):
         if status is None:
             status = self.status()
+        out = []
         for r in status.get_ranks(self.id):
-            self.rank_tell(command, rank=r['rank'], status=status)
+            result = self.rank_tell(command, rank=r['rank'], status=status)
+            out.append((r['rank'], result))
+        return sorted(out)
+
+    def ranks_perf(self, f, status=None):
+        results = self.ranks_tell(["perf", "dump"], status=status)
+        out = []
+        for rank, perf in results:
+            out.append((rank, f(perf)))
+        return out
 
     def read_cache(self, path, depth=None):
         cmd = ["dump", "tree", path]
diff --git a/qa/tasks/cephfs/test_exports.py b/qa/tasks/cephfs/test_exports.py
index 47fb0e2992569e5d26a7a45345d5440bb7e6a700..3b0f5f3352c8e6cf9651315b784de783fbd08c09 100644
@@ -1,5 +1,7 @@
 import logging
+import random
 import time
+import unittest
 from tasks.cephfs.fuse_mount import FuseMount
 from tasks.cephfs.cephfs_test_case import CephFSTestCase
 from teuthology.orchestra.run import CommandFailedError, Raw
@@ -144,9 +146,9 @@ class TestExports(CephFSTestCase):
 
         self.fs.set_max_mds(2)
         status = self.fs.wait_for_daemons()
-        self.mount_a.run_shell(f"mkdir -p foo")
+        self.mount_a.run_shell_payload(f"mkdir -p foo")
         self.mount_a.setfattr(f"foo", "ceph.dir.pin", "0")
-        self.mount_a.run_shell(["bash", "-c", Raw(f"'mkdir -p foo/bar/baz && setfattr -n ceph.dir.pin -v 1 foo/bar'")])
+        self.mount_a.run_shell_payload(f"mkdir -p foo/bar/baz && setfattr -n ceph.dir.pin -v 1 foo/bar")
         self._wait_subtrees([('/foo/bar', 1), ('/foo', 0)], status=status)
         self.mount_a.umount_wait() # release all caps
         def _drop():
@@ -191,199 +193,358 @@ class TestExports(CephFSTestCase):
         new_rank1 = self.fs.get_rank(rank=1)
         self.assertEqual(rank1['gid'], new_rank1['gid'])
 
-    def test_ephememeral_pin_distribution(self):
+class TestEphemeralPins(CephFSTestCase):
+    MDSS_REQUIRED = 3
+    CLIENTS_REQUIRED = 1
 
-        # Check if the subtree distribution under ephemeral distributed pin is fairly uniform
+    def setUp(self):
+        CephFSTestCase.setUp(self)
 
-        self.fs.set_max_mds(3)
-        self.fs.wait_for_daemons()
-
-        status = self.fs.status()
+        self.config_set('mds', 'mds_export_ephemeral_random', True)
+        self.config_set('mds', 'mds_export_ephemeral_distributed', True)
+        self.config_set('mds', 'mds_export_ephemeral_random_max', 1.0)
 
-        self.mount_a.run_shell(["mkdir", "-p", "a"])
-        self._wait_subtrees(status, 0, [])
+        self.mount_a.run_shell_payload(f"""
+set -e
 
-        for i in range(0,100):
-          self.mount_a.run_shell(["mkdir", "-p", "a/" + str(i) + "/d"])
-          
-        self._wait_subtrees(status, 0, [])
-
-        self.mount_b.setfattr(["a", "ceph.dir.pin.distributed", "1"])
-
-        self._wait_distributed_subtrees([status, 0, 100])
-
-        # Check if distribution is uniform
-        rank0_distributed_subtree_ratio = len(self.get_distributed_auth_subtrees(status, 0))/len(self.get_auth_subtrees(status, 0))
-        self.assertGreaterEqual(rank0_distributed_subtree_ratio, 0.2)
+# Use up a random number of inode numbers so the ephemeral pinning is not the same every test.
+mkdir .inode_number_thrash
+count=$((RANDOM % 1024))
+for ((i = 0; i < count; i++)); do touch .inode_number_thrash/$i; done
+rm -rf .inode_number_thrash
+""")
 
-        rank1_distributed_subtree_ratio = len(self.get_distributed_auth_subtrees(status, 1))/len(self.get_auth_subtrees(status, 1))
-        self.assertGreaterEqual(rank1_distributed_subtree_ratio, 0.2)
-
-        rank2_distributed_subtree_ratio = len(self.get_distributed_auth_subtrees(status, 2))/len(self.get_auth_subtrees(status, 2))
-        self.assertGreaterEqual(rank2_distributed_subtree_ratio, 0.2)
-
-    def test_ephemeral_random(self):
-        
-        # Check if export ephemeral random is applied hierarchically
-        
         self.fs.set_max_mds(3)
-        self.fs.wait_for_daemons()
+        self.status = self.fs.wait_for_daemons()
+
+    def _setup_tree(self, path="tree", export=-1, distributed=False, random=0.0, count=100, wait=True):
+        return self.mount_a.run_shell_payload(f"""
+set -e
+mkdir -p {path}
+{f"setfattr -n ceph.dir.pin -v {export} {path}" if export >= 0 else ""}
+{f"setfattr -n ceph.dir.pin.distributed -v 1 {path}" if distributed else ""}
+{f"setfattr -n ceph.dir.pin.random -v {random} {path}" if random > 0.0 else ""}
+for ((i = 0; i < {count}; i++)); do
+    mkdir -p "{path}/$i"
+    echo file > "{path}/$i/file"
+done
+""", wait=wait)
+
+    def test_ephemeral_pin_dist_override(self):
+        """
+        That an ephemeral distributed pin overrides a normal export pin.
+        """
 
-        status = self.fs.status()
+        self._setup_tree(distributed=True)
+        subtrees = self._wait_distributed_subtrees(100, status=self.status, rank="all")
+        for s in subtrees:
+            path = s['dir']['path']
+            if path == '/tree':
+                self.assertEqual(s['export_pin'], 0)
+                self.assertEqual(s['auth_first'], 0)
+            elif path.startswith('/tree/'):
+                self.assertEqual(s['export_pin'], -1)
+                self.assertTrue(s['distributed_ephemeral_pin'])
+
+    def test_ephemeral_pin_dist_override_pin(self):
+        """
+        That an export pin overrides an ephemerally pinned directory.
+        """
 
-        tmp_dir = ""
-        for i in range(0, 100):
-          tmp_dir = tmp_dir + str(i) + "/"
-          self.mount_a.run_shell(["mkdir", "-p", tmp_dir])
-          self.mount_b.setfattr([temp_dir, "ceph.dir.pin.random", "1"])
+        self._setup_tree(distributed=True, export=0)
+        subtrees = self._wait_distributed_subtrees(100, status=self.status, rank="all", path="/tree/")
+        which = None
+        for s in subtrees:
+            if s['auth_first'] == 1:
+                path = s['dir']['path']
+                self.mount_a.setfattr(path[1:], "ceph.dir.pin", "0")
+                which = path
+                break
+        self.assertIsNotNone(which)
+        time.sleep(15)
+        subtrees = self._get_subtrees(status=self.status, rank=0)
+        for s in subtrees:
+            path = s['dir']['path']
+            if path == which:
+                self.assertEqual(s['auth_first'], 0)
+                self.assertFalse(s['distributed_ephemeral_pin'])
+                return
+        # it has been merged into /tree
+
+    def test_ephemeral_pin_dist_off(self):
+        """
+        That turning off ephemeral distributed pin merges subtrees.
+        """
 
-        count = len(get_random_auth_subtrees(status,0))
-        self.assertEqual(count, 100)
+        self._setup_tree(distributed=True, export=0)
+        self._wait_distributed_subtrees(100, status=self.status, rank="all")
+        self.mount_a.setfattr("tree", "ceph.dir.pin.distributed", "0")
+        self._wait_subtrees([('/tree', 0)], status=self.status)
 
-    def test_ephemeral_pin_grow_mds(self):
-        
-        # Increase the no of MDSs and verify that the no of subtrees that migrate are less than 1/3 of the total no of subtrees that are ephemerally pinned
+    def test_ephemeral_pin_dist_conf_off(self):
+        """
+        That turning off ephemeral distributed pin config prevents distribution.
+        """
 
-        self.fs.set_max_mds(3)
-        self.fs.wait_for_daemons()
+        self._setup_tree(export=0)
+        self.config_set('mds', 'mds_export_ephemeral_distributed', False)
+        self.mount_a.setfattr("tree", "ceph.dir.pin.distributed", "1")
+        time.sleep(30)
+        self._wait_subtrees([('/tree', 0)], status=self.status)
 
-        status = self.fs.status()
+    def test_ephemeral_pin_dist_conf_off_merge(self):
+        """
+        That turning off ephemeral distributed pin config merges subtrees.
+        """
 
-        for i in range(0,100):
-          self.mount_a.run_shell(["mkdir", "-p", "a/" + str(i) + "/d"])
-        self._wait_subtrees(status, 0, [])
-        self.mount_b.setfattr(["a", "ceph.dir.pin.distributed", "1"])
-        self._wait_distributed_subtrees([status, 0, 100])
+        self._setup_tree(distributed=True, export=0)
+        self._wait_distributed_subtrees(100, status=self.status)
+        self.config_set('mds', 'mds_export_ephemeral_distributed', False)
+        self._wait_subtrees([('/tree', 0)], timeout=60, status=self.status)
 
-        subtrees_old = dict(get_ephemrally_pinned_auth_subtrees(status, 0).items() + get_ephemrally_pinned_auth_subtrees(status, 1).items() + get_ephemrally_pinned_auth_subtrees(status, 2).items()) 
-        self.fs.set_max_mds(4)
-        self.fs.wait_for_daemons()
-        # Sleeping for a while to allow the ephemeral pin migrations to complete
-        time.sleep(15)
-        subtrees_new = dict(get_ephemrally_pinned_auth_subtrees(status, 0).items() + get_ephemrally_pinned_auth_subtrees(status, 1).items() + get_ephemrally_pinned_auth_subtrees(status, 2).items())
-        for old_subtree in subtrees_old:
-            for new_subtree in subtrees_new:
-                if (old_subtree['dir']['path'] == new_subtree['dir']['path']) and (old_subtree['auth_first'] != new_subtree['auth_first']):
-                    count = count + 1
-                    break
+    def test_ephemeral_pin_dist_override_before(self):
+        """
+        That a conventional export pin overrides the distributed policy _before_ distributed policy is set.
+        """
 
-        assertLessEqual((count/subtrees_old), 0.33)
+        count = 10
+        self._setup_tree(count=count)
+        test = []
+        for i in range(count):
+            path = f"tree/{i}"
+            self.mount_a.setfattr(path, "ceph.dir.pin", "1")
+            test.append(("/"+path, 1))
+        self.mount_a.setfattr("tree", "ceph.dir.pin.distributed", "1")
+        time.sleep(10) # for something to not happen...
+        self._wait_subtrees(test, timeout=60, status=self.status, rank="all", path="/tree/")
+
+    def test_ephemeral_pin_dist_override_after(self):
+        """
+        That a conventional export pin overrides the distributed policy _after_ distributed policy is set.
+        """
 
-    def test_ephemeral_pin_shrink_mds(self):
+        self._setup_tree(count=10, distributed=True)
+        subtrees = self._wait_distributed_subtrees(10, status=self.status, rank="all")
+        victim = None
+        test = []
+        for s in subtrees:
+            path = s['dir']['path']
+            auth = s['auth_first']
+            if auth in (0, 2) and victim is None:
+                victim = path
+                self.mount_a.setfattr(victim[1:], "ceph.dir.pin", "1")
+                test.append((victim, 1))
+            else:
+                test.append((path, auth))
+        self.assertIsNotNone(victim)
+        self._wait_subtrees(test, status=self.status, rank="all", path="/tree/")
+
+    def test_ephemeral_pin_dist_failover(self):
+        """
+        That MDS failover does not cause unnecessary migrations.
+        """
 
-        # Shrink the no of MDSs
+        # pin /tree so it does not export during failover
+        self._setup_tree(distributed=True, export=0)
+        subtrees = self._wait_distributed_subtrees(100, status=self.status, rank="all")
+        test = [(s['dir']['path'], s['auth_first']) for s in subtrees]
+        before = self.fs.ranks_perf(lambda p: p['mds']['exported'])
+        log.info(f"export stats: {before}")
+        self.fs.rank_fail(rank=1)
+        self.status = self.fs.wait_for_daemons()
+        time.sleep(10) # waiting for something to not happen
+        after = self.fs.ranks_perf(lambda p: p['mds']['exported'])
+        log.info(f"export stats: {after}")
+        self.assertEqual(before, after)
+
+    def test_ephemeral_pin_distribution(self):
+        """
+        That ephemerally pinned subtrees are somewhat evenly distributed.
+        """
 
         self.fs.set_max_mds(3)
-        self.fs.wait_for_daemons()
+        self.status = self.fs.wait_for_daemons()
 
-        status = self.fs.status()
-
-        for i in range(0,100):
-          self.mount_a.run_shell(["mkdir", "-p", "a/" + str(i) + "/d"])
-        self._wait_subtrees(status, 0, [])
-        self.mount_b.setfattr(["a", "ceph.dir.pin.distributed", "1"])
-        self._wait_distributed_subtrees([status, 0, 100])
+        count = 1000
+        self._setup_tree(count=count, distributed=True)
+        subtrees = self._wait_distributed_subtrees(count, status=self.status, rank="all")
+        nsubtrees = len(subtrees)
 
-        subtrees_old = dict(get_ephemrally_pinned_auth_subtrees(status, 0).items() + get_ephemrally_pinned_auth_subtrees(status, 1).items() + get_ephemrally_pinned_auth_subtrees(status, 2).items())
-        self.fs.set_max_mds(2)
-        self.fs.wait_for_daemons()
-        time.sleep(15)
+        # Check if distribution is uniform
+        rank0 = list(filter(lambda x: x['auth_first'] == 0, subtrees))
+        rank1 = list(filter(lambda x: x['auth_first'] == 1, subtrees))
+        rank2 = list(filter(lambda x: x['auth_first'] == 2, subtrees))
+        self.assertGreaterEqual(len(rank0)/nsubtrees, 0.2)
+        self.assertGreaterEqual(len(rank1)/nsubtrees, 0.2)
+        self.assertGreaterEqual(len(rank2)/nsubtrees, 0.2)
 
-        subtrees_new = dict(get_ephemrally_pinned_auth_subtrees(status, 0).items() + get_ephemrally_pinned_auth_subtrees(status, 1).items() + get_ephemrally_pinned_auth_subtrees(status, 2).items())
-        for old_subtree in subtrees_old:
-            for new_subtree in subtrees_new:
-                if (old_subtree['dir']['path'] == new_subtree['dir']['path']) and (old_subtree['auth_first'] != new_subtree['auth_first']):
-                    count = count + 1
-                    break
+    def test_ephemeral_random(self):
+        """
+        That 100% randomness causes all children to be pinned.
+        """
+        self._setup_tree(random=1.0)
+        self._wait_random_subtrees(100, status=self.status, rank="all")
 
-        assertLessEqual((count/subtrees_old), 0.33)
+    def test_ephemeral_random_max(self):
+        """
+        That the config mds_export_ephemeral_random_max is not exceeded.
+        """
 
-    def test_ephemeral_pin_unset_config(self):
+        r = 0.5
+        count = 1000
+        self._setup_tree(count=count, random=r)
+        subtrees = self._wait_random_subtrees(int(r*count*.75), status=self.status, rank="all")
+        self.config_set('mds', 'mds_export_ephemeral_random_max', 0.01)
+        self._setup_tree(path="tree/new", count=count)
+        time.sleep(30) # for something not to happen...
+        subtrees = self._get_subtrees(status=self.status, rank="all", path="/tree/new/")
+        self.assertLessEqual(len(subtrees), int(.01*count*1.25))
+
+    def test_ephemeral_random_max_config(self):
+        """
+        That the mds_export_ephemeral_random_max config rejects new out-of-bounds (OOB) policies.
+        """
 
-        # Check if unsetting the distributed pin config results in every distributed pin being unset
+        self.config_set('mds', 'mds_export_ephemeral_random_max', 0.01)
+        try:
+            p = self._setup_tree(count=1, random=0.02, wait=False)
+            p.wait()
+        except CommandFailedError as e:
+            log.info(f"{e}")
+            self.assertIn("Invalid", p.stderr.getvalue())
+        else:
+            raise RuntimeError("mds_export_ephemeral_random_max ignored!")
 
-        self.fs.set_max_mds(3)
-        self.fs.wait_for_daemons()
+    def test_ephemeral_random_dist(self):
+        """
+        That ephemeral random and distributed can coexist with each other.
+        """
 
-        status = self.fs.status()
+        self._setup_tree(random=1.0, distributed=True, export=0)
+        self._wait_distributed_subtrees(100, status=self.status)
+        self._wait_random_subtrees(100, status=self.status)
 
-        for i in range(0, 10):
-            self.mount_a.run_shell(["mkdir", "-p", i +"/dummy_dir"])
-            self.mount_b.setfattr([i, "ceph.dir.pin.distributed", "1"])
+    def test_ephemeral_random_pin_override_before(self):
+        """
+        That a conventional export pin overrides the random policy before creating new directories.
+        """
 
-        self._wait_distributed_subtrees([status, 0, 10])
+        self._setup_tree(count=0, random=1.0)
+        self._setup_tree(path="tree/pin", count=10, export=1)
+        self._wait_subtrees([("/tree/pin", 1)], status=self.status, rank=1, path="/tree/pin")
 
-        self.fs.mds_asok(["config", "set", "mds_export_ephemeral_distributed_config", "false"])
-        # Sleep for a while to facilitate unsetting of the pins
-        time.sleep(15)
-        
-        for i in range(0, 10):
-            self.assertTrue(self.mount_a.getfattr(i, "ceph.dir.pin.distributed") == "0")
+    def test_ephemeral_random_pin_override_after(self):
+        """
+        That a conventional export pin overrides the random policy after creating new directories.
+        """
 
-    def test_ephemeral_distributed_pin_unset(self):
+        count = 10
+        self._setup_tree(count=0, random=1.0)
+        self._setup_tree(path="tree/pin", count=count)
+        self._wait_random_subtrees(count+1, status=self.status, rank="all")
+        self.mount_a.setfattr(f"tree/pin", "ceph.dir.pin", "1")
+        self._wait_subtrees([("/tree/pin", 1)], status=self.status, rank=1, path="/tree/pin")
 
-        # Test if unsetting the distributed ephemeral pin on a parent directory then the children directory should not be ephemerally pinned anymore
+    def test_ephemeral_randomness(self):
+        """
+        That the randomness is reasonable.
+        """
 
-        self.fs.set_max_mds(3)
-        self.fs.wait_for_daemons()
+        r = random.uniform(0.25, 0.75) # ratios don't work for small r!
+        count = 1000
+        self._setup_tree(count=count, random=r)
+        subtrees = self._wait_random_subtrees(int(r*count*.50), status=self.status, rank="all")
+        time.sleep(30) # for max to not be exceeded
+        subtrees = self._wait_random_subtrees(int(r*count*.50), status=self.status, rank="all")
+        self.assertLessEqual(len(subtrees), int(r*count*1.50))
 
-        status = self.fs.status()
+    def test_ephemeral_random_cache_drop(self):
+        """
+        That the random ephemeral pin does not prevent empty (nothing in cache) subtree merging.
+        """
 
-        for i in range(0, 10):
-            self.mount_a.run_shell(["mkdir", "-p", i +"/a/b"])
-            self.mount_b.setfattr([i, "ceph.dir.pin.distributed", "1"])
+        count = 100
+        self._setup_tree(count=count, random=1.0)
+        subtrees = self._wait_random_subtrees(count, status=self.status, rank="all")
+        self.mount_a.umount_wait() # release all caps
+        def _drop():
+            self.fs.ranks_tell(["cache", "drop"], status=self.status)
+        self._wait_subtrees([], status=self.status, action=_drop)
 
-        self._wait_distributed_subtrees([status, 0, 10])
+    def test_ephemeral_random_failover(self):
+        """
+        That the random ephemeral pins stay pinned across MDS failover.
+        """
 
-        for i in range(0, 10):
-            self.mount_a.run_shell(["mkdir", "-p", i +"/a/b"])
-            self.mount_b.setfattr([i, "ceph.dir.pin.distributed", "0"])
+        count = 100
+        r = 0.5
+        self._setup_tree(count=count, random=r, export=0)
+        # wait for all random subtrees to be created, not a specific count
+        time.sleep(30)
+        subtrees = self._wait_random_subtrees(1, status=self.status, rank=1)
+        test = [(s['dir']['path'], s['auth_first']) for s in subtrees]
+        before = self.fs.ranks_perf(lambda p: p['mds']['exported'])
+        log.info(f"export stats: {before}")
+        self.fs.rank_fail(rank=1)
+        self.status = self.fs.wait_for_daemons()
+        time.sleep(30) # waiting for something to not happen
+        self._wait_subtrees(test, status=self.status, rank=1)
+        after = self.fs.ranks_perf(lambda p: p['mds']['exported'])
+        log.info(f"export stats: {after}")
+        self.assertEqual(before, after)
 
-        time.sleep(15)
+    def test_ephemeral_pin_grow_mds(self):
+        """
+        That consistent hashing works to reduce the number of migrations.
+        """
 
-        subtree_count = len(get_distributed_auth_subtrees(status, 0))
-        assertEqual(subtree_count, 0)
+        self.fs.set_max_mds(2)
+        self.status = self.fs.wait_for_daemons()
 
-    def test_ephemeral_standby(self):
+        self._setup_tree(distributed=True)
+        subtrees_old = self._wait_distributed_subtrees(100, status=self.status, rank="all")
 
-        # Test if the distribution is unaltered when a Standby MDS takes up a failed rank
+        self.fs.set_max_mds(3)
+        self.status = self.fs.wait_for_daemons()
         
-        # Need all my standbys up as well as the active daemons
-        self.wait_for_daemon_start()
-        status = self.fs.status()
-
-        for i in range(0, 10):
-            self.mount_a.run_shell(["mkdir", "-p", i +"/a/b"])
-            self.mount_b.setfattr([i, "ceph.dir.pin.distributed", "1"])
-
-        self._wait_distributed_subtrees([status, 0, 10])
-
-        original_subtrees = get_ephemerally_pinned_auth_subtrees(status, 0)
-
-        # Flush the journal for rank 0
-        self.fs.rank_asok(["flush", "journal"], rank=0, status=status)
-
-        (original_active, ) = self.fs.get_active_names()
-        original_standbys = self.mds_cluster.get_standby_daemons()
+        # Sleeping for a while to allow the ephemeral pin migrations to complete
+        time.sleep(30)
+        
+        subtrees_new = self._wait_distributed_subtrees(100, status=self.status, rank="all")
+        count = 0
+        for old_subtree in subtrees_old:
+            for new_subtree in subtrees_new:
+                if (old_subtree['dir']['path'] == new_subtree['dir']['path']) and (old_subtree['auth_first'] != new_subtree['auth_first']):
+                    count = count + 1
+                    break
 
-        # Kill the rank 0 daemon's physical process
-        self.fs.mds_stop(original_active)
+        log.info("{0} migrations have occured due to the cluster resizing".format(count))
+        # ~50% of subtrees from the two rank will migrate to another rank
+        self.assertLessEqual((count/len(subtrees_old)), (0.5)*1.25) # with 25% overbudget
 
-        grace = float(self.fs.get_config("mds_beacon_grace", service_type="mon"))
+    def test_ephemeral_pin_shrink_mds(self):
+        """
+        That consistent hashing works to reduce the number of migrations.
+        """
 
-        # Wait until the monitor promotes his replacement
-        def promoted():
-            active = self.fs.get_active_names()
-            return active and active[0] in original_standbys
+        self.fs.set_max_mds(3)
+        self.status = self.fs.wait_for_daemons()
 
-        log.info("Waiting for promotion of one of the original standbys {0}".format(
-            original_standbys))
-        self.wait_until_true(
-            promoted,
-            timeout=grace*2)
+        self._setup_tree(distributed=True)
+        subtrees_old = self._wait_distributed_subtrees(100, status=self.status, rank="all")
 
-        self.fs.wait_for_state('up:active', rank=0, timeout=MDS_RESTART_GRACE)
+        self.fs.set_max_mds(2)
+        self.status = self.fs.wait_for_daemons()
+        time.sleep(30)
 
-        new_subtrees = get_ephemerally_pinned_auth_subtrees(status, 0)
+        subtrees_new = self._wait_distributed_subtrees(100, status=self.status, rank="all")
+        count = 0
+        for old_subtree in subtrees_old:
+            for new_subtree in subtrees_new:
+                if (old_subtree['dir']['path'] == new_subtree['dir']['path']) and (old_subtree['auth_first'] != new_subtree['auth_first']):
+                    count = count + 1
+                    break
 
-        assertEqual(original_subtrees, new_subtrees)
+        log.info("{0} migrations have occured due to the cluster resizing".format(count))
+        # rebalancing from 3 -> 2 may cause half of rank 0/1 to move and all of rank 2
+        self.assertLessEqual((count/len(subtrees_old)), (1.0/3.0/2.0 + 1.0/3.0/2.0 + 1.0/3.0)*1.25) # aka .66 with 25% overbudget
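
A small worked check of the migration bounds asserted by the two resize tests (pure arithmetic; the fractions come from the in-line comments and 1.25 is the tests' 25% overbudget):

    # Growing 2 -> 3 ranks: the test budgets ~50% of the old subtrees
    # migrating, so with 100 subtrees it allows count/100 <= 0.5 * 1.25,
    # i.e. at most 62 migrations.
    grow_bound = 0.5 * 1.25                                # 0.625

    # Shrinking 3 -> 2 ranks: all of rank 2 moves (1/3 of the total) and
    # about half of ranks 0 and 1 may rebalance (1/6 each), ~2/3 overall.
    shrink_fraction = 1/3/2 + 1/3/2 + 1/3                  # = 2/3
    shrink_bound = shrink_fraction * 1.25                  # ~0.833
    print(f"grow bound {grow_bound:.3f}, shrink bound {shrink_bound:.3f}")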